def train_model(self):
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    train_data = list()

    self.total_events = es.count_documents(search_query=search_query)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)

    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing SVM training set")
    for doc in es.scan(search_query=search_query):
        if len(train_data) < total_training_events:
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            train_data.append(fields)
        else:
            # We have collected sufficient training data
            break

    # Now, train the model
    if len(train_data) > 0:
        pass  # Train!!
    else:
        logging.logger.warning("no training data to train model on. Are you sure the model configuration is correctly defined?")
def train_model(self):
    w2v_model = word2vec.Word2Vec(name=self.model_name)
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    sentences = list()

    self.total_events = es.count_documents(search_query=search_query)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)

    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing word2vec training set")
    for doc in es.scan(search_query=search_query):
        if len(sentences) < total_training_events:
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            if set(self.model_settings["sentence_format"]).issubset(fields.keys()):
                new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                for sentence in new_sentences:
                    sentences.append(tuple(sentence))

                # Remove all duplicates from sentences for training - REMOVED FOR TESTING
                # sentences = list(sentences)
        else:
            # We have collected sufficient training data
            break

    # Now, train the model
    if len(sentences) > 0:
        w2v_model.train_model(sentences)
    else:
        logging.logger.warning("no sentences to train model on. Are you sure the sentence configuration is correctly defined?")
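# helpers.utils.flatten_fields_into_sentences() and helpers.utils.flatten_sentence()
# are used throughout this section but not defined in it. Below is a minimal,
# self-contained sketch of plausible implementations, inferred purely from the call
# sites (an assumption, not the project's actual code): the first builds one sentence
# (a list of values) per combination of the requested fields, raising KeyError when a
# field is missing so callers can skip the document; the second collapses a sentence
# into a single comparable string.
import itertools


def flatten_fields_into_sentences(fields=None, sentence_format=None):
    field_values = []
    for field_name in sentence_format:
        value = fields[field_name]  # KeyError on missing field, caught by callers
        # Wrap scalar values so we can take the cartesian product below
        field_values.append(value if isinstance(value, list) else [value])

    # One sentence per combination of field values, e.g. a document with two
    # source IPs and one hostname yields two sentences
    return [list(combo) for combo in itertools.product(*field_values)]


def flatten_sentence(sentence=None):
    if sentence is None:
        return None
    if isinstance(sentence, (list, tuple)):
        return " - ".join(str(item) for item in sentence)
    return str(sentence)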
def evaluate_model(self):
    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=self.search_query,
                                                               model_settings=self.model_settings)
    self.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

    if self.total_events > 0:
        current_batch = defaultdict()
        targets_for_next_batch = defaultdict()
        total_targets_in_batch = 0

        for doc in documents:
            logging.tick()
            target_sentences, aggregator_sentences = self._compute_aggregator_and_target_value(
                doc, self.model_settings["target"])

            if target_sentences is not None and aggregator_sentences is not None:
                # Add current document to current_batch
                current_batch = self._add_document_to_batch(current_batch, target_sentences,
                                                            aggregator_sentences, doc)
                total_targets_in_batch += len(target_sentences) * len(aggregator_sentences)

            # Evaluate batch of events against the model
            is_last_batch = (logging.current_step == self.total_events)  # Check if it is the last batch
            # Run if it is the last batch OR if the batch size is large enough
            if is_last_batch or total_targets_in_batch >= self.terms_batch_eval_size:
                # Display log message
                log_message = "evaluating batch of " + "{:,}".format(total_targets_in_batch) + " terms "
                if len(targets_for_next_batch) > 0:
                    log_message += "(+ " + "{:,}".format(len(targets_for_next_batch)) + " terms from last batch) "
                log_message += "[" + "{:,}".format(logging.current_step) + " events processed]"
                logging.logger.info(log_message)

                # Evaluate the current batch
                outliers_in_batch, targets_for_next_batch = self._evaluate_batch_for_outliers(batch=current_batch)

                if outliers_in_batch:
                    unique_summaries_in_batch = len(set(o.outlier_dict["summary"] for o in outliers_in_batch))
                    logging.logger.info("processing " + "{:,}".format(len(outliers_in_batch)) +
                                        " outliers in batch [" + "{:,}".format(unique_summaries_in_batch) +
                                        " unique summaries]")

                    for outlier in outliers_in_batch:
                        self.process_outlier(outlier)
                else:
                    logging.logger.info("no outliers processed in batch")

                # Reset data structures for next batch
                current_batch = targets_for_next_batch
                total_targets_in_batch = 0

    self.print_analysis_summary()
def find_sudden_appearance(self, start_slide_win, end_slide_win):
    """
    Find sudden appearances, within the aggregations defined by self.model_settings["aggregator"], of the
    term field defined by self.model_settings["target"] in events inside the time window defined by
    start_slide_win and end_slide_win, and create outliers. An event is considered an outlier when the term
    field appears for the first time after (end_slide_win - self.jump_win).

    :param start_slide_win: start time of the time window
    :param end_slide_win: end time of the time window
    """
    aggregator_buckets = es.scan_first_occur_documents(search_query=self.search_query,
                                                       start_time=start_slide_win,
                                                       end_time=end_slide_win,
                                                       model_settings=self.model_settings)
    # Loop over the aggregations
    for aggregator_bucket in aggregator_buckets:
        target_buckets = aggregator_bucket["target"]["buckets"]
        # Loop over the documents in the aggregation
        for doc in target_buckets:
            self.num_event_proc += doc["doc_count"]
            raw_doc = doc["top_doc"]["hits"]["hits"][0]
            fields = es.extract_fields_from_document(
                raw_doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            # Convert the event timestamp to the right format
            event_timestamp = dateutil.parser.parse(fields[self.model_settings["timestamp_field"]],
                                                    ignoretz=True)

            if event_timestamp > (end_slide_win - self.jump_win):
                # Retrieve extra information
                extra_outlier_information = dict()
                extra_outlier_information["size_time_window"] = str(self.delta_slide_win)
                extra_outlier_information["start_time_window"] = str(start_slide_win)
                extra_outlier_information["end_time_window"] = str(end_slide_win)
                extra_outlier_information["aggregator"] = self.model_settings["aggregator"]
                extra_outlier_information["aggregator_value"] = aggregator_bucket["key"]
                extra_outlier_information["target"] = self.model_settings["target"]
                extra_outlier_information["target_value"] = doc["key"]
                extra_outlier_information["num_target_value_in_window"] = doc["doc_count"]

                outlier = self.create_outlier(fields, raw_doc, extra_outlier_information=extra_outlier_information)
                self.process_outlier(outlier)

                summary = "In aggregator '%s: %s', the field(s) '%s: %s' appear(s) suddenly at %s of the " \
                          "time window of size %s." % \
                          (", ".join(self.model_settings["aggregator"]), aggregator_bucket["key"],
                           ", ".join(self.model_settings["target"]), doc["key"],
                           str(event_timestamp), self.delta_slide_win)
                logging.logger.debug(summary)

    logging.tick(self.num_event_proc)
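# es.scan_first_occur_documents() is not shown in this section. Based on how its
# result is consumed above (buckets per aggregator, nested "target" buckets, each
# carrying a "top_doc" hit and a "doc_count"), it presumably wraps an Elasticsearch
# terms aggregation with a top_hits sub-aggregation that keeps the earliest document
# per target value. The request body below is an illustrative sketch under that
# assumption; the field names "aggregator_field", "target_field" and "timestamp"
# are placeholders, not the project's actual configuration.
first_occurrence_aggs = {
    "aggs": {
        "aggregator": {
            "terms": {"field": "aggregator_field.keyword"},
            "aggs": {
                "target": {
                    "terms": {"field": "target_field.keyword"},
                    "aggs": {
                        "top_doc": {
                            "top_hits": {
                                # keep only the earliest occurrence of this target value
                                "sort": [{"timestamp": {"order": "asc"}}],
                                "size": 1
                            }
                        }
                    }
                }
            }
        }
    }
}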
def evaluate_model(self):
    self.extract_extra_model_settings()

    # Train the model
    if self.model_settings["train_model"]:
        self.train_model()
        return

    w2v_model = word2vec.Word2Vec(name=self.model_name)
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    if not w2v_model.is_trained():
        logging.logger.warning("model was not trained! Skipping analysis.")
    else:
        # Check if we need to run the test data instead of real data
        if w2v_model.use_test_data:
            logging.print_generic_intro("using test data instead of live data to evaluate model " + self.model_name)
            self.evaluate_test_sentences(w2v_model=w2v_model)
            return

        self.total_events = es.count_documents(search_query=search_query)
        logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating word2vec model")

        raw_docs = list()
        eval_sentences = list()

        for doc in es.scan(search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)

            try:
                new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                eval_sentences.extend(new_sentences)
            except KeyError:
                logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
                continue

            for _ in new_sentences:
                raw_docs.append(doc)

            # Evaluate batch of events against the model
            if logging.current_step == self.total_events or len(eval_sentences) >= settings.config.getint("machine_learning", "word2vec_batch_eval_size"):
                logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model, eval_sentences=eval_sentences, raw_docs=raw_docs)

                if len(outliers) > 0:
                    unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                    logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")

                # Reset data structures for next batch
                raw_docs = list()
                eval_sentences = list()
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)

    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating beaconing model")

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_terms_added >= settings.config.getint("beaconing", "beaconing_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = evaluate_batch_for_outliers(terms=eval_terms_array, model_settings=model_settings)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)

    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        # Add your model logic here
        logging.logger.info(json.dumps(fields, indent=4))
def perform_analysis():
    for name in settings.config.sections():
        if name.startswith("terms_"):
            _, model_name = name.split("terms_", 1)

            # Note: the original assignments had these two config lookups swapped
            should_test_model = settings.config.getboolean("general", "test_models") and settings.config.getboolean(name, "test_model")
            should_run_model = settings.config.getboolean("general", "run_models") and settings.config.getboolean(name, "run_model")

            if should_test_model or should_run_model:
                model_settings = extract_model_settings(name)

                if "*" in model_settings["target"]:
                    original_model_name = model_name
                    logging.logger.warning("running terms model in brute force mode, could take a long time!")

                    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
                    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

                    total_events = es.count_documents(lucene_query=lucene_query)
                    logging.init_ticker(total_steps=min(total_events, batch_size), desc=model_name + " - extracting brute force fields")

                    field_names = set()
                    num_docs_processed = 0
                    for doc in es.scan(lucene_query=lucene_query):
                        logging.tick()
                        fields = es.extract_fields_from_document(doc)
                        fields = helpers.utils.flatten_dict(fields)

                        # skip all fields that are related to outliers, we don't want to brute force them
                        for field_name in list(fields.keys()):  # create list instead of iterator so we can mutate the dictionary when iterating
                            if field_name.startswith('outliers.'):
                                logging.logger.debug("not brute forcing outliers field " + str(field_name))
                                fields.pop(field_name)

                        field_names.update(fields.keys())

                        # only process a single batch of events in order to decide which fields to brute force
                        if num_docs_processed == batch_size:
                            break
                        else:
                            num_docs_processed += 1

                    logging.logger.info("going to brute force " + str(len(field_names)) + " fields")

                    for field_name in field_names:
                        model_name = original_model_name + " [" + field_name + "]"
                        # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
                        if "." in field_name:
                            model_settings["target"] = list([field_name])
                            model_settings["brute_forced_field"] = field_name  # so it can be added to the outlier events automatically
                            evaluate_model(model_name=model_name, model_settings=model_settings, brute_force=True)
                else:
                    evaluate_model(model_name=model_name, model_settings=model_settings)
def evaluate_model(self):
    # Filter out events that have already been flagged as outliers by this model
    model_filter = {"bool": {"filter": [{"term": {"outliers.model_name.keyword": {"value": self.model_name}}},
                                        {"term": {"outliers.model_type.keyword": {"value": "simplequery"}}}]}}
    exclude_hits_filter = {"bool": {"must_not": model_filter}}

    query = self.search_query
    if "filter" in query:
        query["filter"].append(exclude_hits_filter)
    else:
        query["filter"] = [exclude_hits_filter]

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=query,
                                                               model_settings=self.model_settings)
    self.print_analysis_intro(event_type="evaluating " + self.model_type + "_" + self.model_name,
                              total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating " + self.model_type + " model")

    if self.total_events > 0:
        for doc in documents:
            logging.tick()
            fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            outlier = self.create_outlier(fields, doc)
            self.process_outlier(outlier)

    self.print_analysis_summary()
def evaluate_model(self):
    self.total_events = es.count_documents(search_query=self.search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.config_section_name, total_events=self.total_events)

    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating " + self.model_type + " model")
    for doc in es.scan(search_query=self.search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)
        self.process_outlier(fields, doc)

    self.print_analysis_summary()
def _calculate_target_fields_to_brute_force(self):
    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

    self.total_events = es.count_documents(index=self.es_index, search_query=self.search_query,
                                           model_settings=self.model_settings)
    logging.init_ticker(total_steps=min(self.total_events, batch_size),
                        desc=self.model_name + " - extracting brute force fields")

    field_names_to_brute_force = set()
    if self.total_events > 0:
        num_docs_processed = 0
        for doc in es.scan(index=self.es_index, search_query=self.search_query, model_settings=self.model_settings):
            logging.tick()
            fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            fields = helpers.utils.flatten_dict(fields)

            # create list instead of iterator so we can mutate the dictionary when iterating
            for field_name in list(fields.keys()):
                # skip all fields that are related to outliers, we don't want to brute force them
                if field_name.startswith('outliers.'):
                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                    continue
                # only brute force nested fields, so not the top level fields such as timestamp,
                # deployment name, etc.
                if "." in field_name:
                    field_names_to_brute_force.add(field_name)

            # only process a single batch of events in order to decide which fields to brute force
            if num_docs_processed == batch_size:
                break
            else:
                num_docs_processed += 1

    logging.logger.info("going to brute force " + str(len(field_names_to_brute_force)) + " fields")
    return field_names_to_brute_force
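# helpers.utils.flatten_dict() is used above to turn nested documents into dotted
# keys (e.g. "outliers.model_name") that can be tested with startswith() and "."
# membership. It is not defined in this section; the following is a minimal sketch
# of a plausible implementation under that assumption:
def flatten_dict(nested_dict, parent_key="", separator="."):
    flattened = {}
    for key, value in nested_dict.items():
        new_key = parent_key + separator + key if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested dictionaries, prefixing child keys
            flattened.update(flatten_dict(value, new_key, separator))
        else:
            flattened[new_key] = value
    return flattened


# Example: flatten_dict({"outliers": {"model_name": "x"}})
# -> {"outliers.model_name": "x"}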
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)

    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    outliers = list()
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        outlier_summary = replace_placeholder_string_with_fields(model_settings["outlier_summary"], fields)
        outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)

        outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)

        if len(outlier_assets) > 0:
            outlier.add_observation("assets", outlier_assets)

        outliers.append(outlier)
        es.process_outliers(doc=doc, outliers=[outlier], should_notify=model_settings["should_notify"])

    if len(outliers) > 0:
        unique_summaries = len(set(o.get_observation("summary") for o in outliers))
        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique]")
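# replace_placeholder_string_with_fields() is referenced above but not shown in this
# section. A minimal sketch of a plausible implementation, assuming the outlier
# summary template uses {field_name} placeholders substituted with document values
# (an assumption based on the call site, not the project's actual code):
def replace_placeholder_string_with_fields(placeholder_string, fields):
    result = placeholder_string
    for field_name, field_value in fields.items():
        # Replace each {field_name} occurrence with the stringified field value
        result = result.replace("{" + field_name + "}", str(field_value))
    return result


# Example: replace_placeholder_string_with_fields(
#     "unusual process {process_name}", {"process_name": "evil.exe"})
# -> "unusual process evil.exe"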
def train_model(self):
    train_data = list()

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=self.search_query,
                                                               model_settings=self.model_settings)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    self.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)
    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing training set")
    if self.total_events > 0:
        for doc in documents:
            if len(train_data) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                train_data.append(fields)
            else:
                # We have collected sufficient training data
                break

    # Now, train the model
    if train_data:
        pass  # Train!!
    else:
        logging.logger.warning("no training data to train model on. Are you sure the model configuration is correctly defined?")
def train_model(self, sentences):
    sentences, words_to_indices, indices_to_words, words = flatten_and_build_indices(sentences)

    # Global position within sentences array
    sentence_index = 0
    vocabulary_size = len(set(words))  # Number of unique words in our vocabulary

    logging.logger.debug("number of training sentences: " + str(len(sentences)))
    logging.logger.debug("words: " + str(len(words)))
    logging.logger.debug("vocabulary size: " + str(vocabulary_size))

    graph = tf.Graph()

    with graph.as_default():
        with tf.name_scope('inputs'):
            # Placeholders are structures for feeding input values
            train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Define embedding matrix variable
            # Variables are the parameters of the model that are being optimized
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(tf.random_uniform([vocabulary_size, self.embedding_size], -1.0, 1.0),
                                         name="embeddings")
                # Take an input vector of integer indices,
                # and "look up" these indices in the supplied embeddings tensor.
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, self.embedding_size],
                                                              stddev=1.0 / math.sqrt(self.embedding_size)),
                                          name="weights")

            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name="biases")

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=train_labels,
                               inputs=embed,
                               num_sampled=self.num_sampled,
                               num_classes=vocabulary_size))

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the SGD optimizer using a learning rate of 1.0.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True), name="norm")
        # normalized_embeddings = embeddings / norm

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Add saver
        # Save only latest model
        saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)

    # BEGIN TRAINING
    logging.logger.info("training word2vec model using " + str(len(sentences)) + " samples")
    logging.init_ticker(total_steps=self.num_steps, desc=self.model_name + " - training word2vec model")

    with tf.Session(graph=graph) as session:
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter(self.log_dir, session.graph)

        # We must initialize all variables before we use them.
        init.run()
        logging.logger.debug('Initialized all variables')
        logging.logger.debug(norm)

        average_loss = 0
        average_historical_loss = list()

        step = 0
        while step < self.num_steps:
            logging.tick()
            batch_inputs, batch_labels, sentence_index = generate_batch(self.batch_size, self.skip_window,
                                                                        sentences, sentence_index)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()).
            # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
            # Feed metadata variable to session for visualizing the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)

            # Add metadata to visualize the graph for the last run.
            if step == (self.num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 1000 == 0:
                if step > 0:
                    average_loss /= 1000
                    average_historical_loss.append(average_loss)

                # The average loss is an estimate of the loss over the last 1000 steps.
                logging.logger.info('average loss at step ' + str(step) + ': ' + str(average_loss))
                average_loss = 0

                # Check if historical loss is showing signs of improvement
                if len(average_historical_loss) >= 10:
                    if np.std(average_historical_loss[-10:]) < 1:
                        logging.logger.info("loss seems to have stabilized, stopping training process")
                        step = self.num_steps - 1

            if step % self.save_every == 0:
                saver.save(session, self.meta_graph_dir)

            step = step + 1

        # Save used embeddings together with the model
        with open(self.words_to_indices_filename, "w") as f:
            json.dump(words_to_indices, f)

        with open(self.indices_to_words_filename, "w") as f:
            json.dump(indices_to_words, f)

        # Write corresponding labels for the embeddings.
        with open(self.metadata_filename, 'w') as f:
            for i in range(vocabulary_size):
                f.write(indices_to_words[i] + '\n')

        # Create a configuration for visualizing embeddings with the labels in TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = self.metadata_filename
        projector.visualize_embeddings(writer, config)

        writer.close()
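# generate_batch() and flatten_and_build_indices() are assumed helpers that are not
# part of this section. The sketch below shows one way generate_batch() could produce
# skip-gram training pairs matching the signature used above: for each center word,
# every other word within skip_window positions in the same sentence becomes a
# (center, context) pair. This is an illustrative reconstruction, not the project's
# actual implementation; it assumes sentences have already been converted to lists
# of integer word indices.
import numpy as np


def generate_batch(batch_size, skip_window, sentences, sentence_index):
    batch_inputs = np.zeros(shape=(batch_size,), dtype=np.int32)
    batch_labels = np.zeros(shape=(batch_size, 1), dtype=np.int32)
    pair_count = 0

    while pair_count < batch_size:
        sentence = sentences[sentence_index]
        for center_pos, center_word in enumerate(sentence):
            # Context window around the center word, clipped to the sentence bounds
            window_start = max(0, center_pos - skip_window)
            window_end = min(len(sentence), center_pos + skip_window + 1)
            for context_pos in range(window_start, window_end):
                if context_pos == center_pos:
                    continue
                batch_inputs[pair_count] = center_word
                batch_labels[pair_count, 0] = sentence[context_pos]
                pair_count += 1
                if pair_count == batch_size:
                    return batch_inputs, batch_labels, sentence_index
        # Wrap around the corpus so training can run for the full num_steps batches
        sentence_index = (sentence_index + 1) % len(sentences)

    return batch_inputs, batch_labels, sentence_index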
def evaluate_target(self, target, search_query, brute_force=False):
    self.total_events = es.count_documents(index=self.es_index, search_query=search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

    if brute_force:
        logging.logger.info("brute forcing field %s", str(target[0]))

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(index=self.es_index, search_query=search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=target)
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            if brute_force:
                observations["brute_forced_field"] = self.model_settings["brute_forced_field"]

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = self.add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if last_batch or total_terms_added >= settings.config.getint("terms", "terms_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = self.evaluate_batch_for_outliers(terms=eval_terms_array)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            if outlier_batches_trend == -3 and brute_force:
                logging.logger.info("too many batches without outliers, we are not going to continue brute forcing")
                break

            if outlier_batches_trend == 3 and brute_force:
                logging.logger.info("too many batches with outliers, we are not going to continue brute forcing")
                break

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0

    self.print_analysis_summary()
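# add_term_to_batch() accumulates one batch entry per aggregator value. Its
# implementation is not shown in this section; the sketch below (written as a
# standalone function, though it is called as a method above) is a plausible
# reconstruction based on the call sites, an assumption rather than the project's
# actual code: each aggregator key collects the target values, observations and raw
# documents that will be evaluated together once the batch is full.
from collections import defaultdict


def add_term_to_batch(eval_terms_array, aggregator_value, target_value, observations, doc):
    if aggregator_value not in eval_terms_array:
        eval_terms_array[aggregator_value] = defaultdict(list)

    eval_terms_array[aggregator_value]["targets"].append(target_value)
    eval_terms_array[aggregator_value]["observations"].append(observations)
    eval_terms_array[aggregator_value]["raw_docs"].append(doc)

    return eval_terms_array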
def evaluate_model(self):
    batch = defaultdict()  # Contains the current batch information
    remaining_metrics = defaultdict()
    total_metrics_in_batch = 0

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=self.search_query,
                                                               model_settings=self.model_settings)
    self.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating " + self.model_type + " model")

    if self.total_events > 0:
        for doc in documents:
            logging.tick()
            # Extract target and aggregator values
            target_value, aggregator_sentences = self._compute_aggregator_and_target_value(doc)

            # If target and aggregator values exist
            if target_value is not None and aggregator_sentences is not None:
                # Add current document to eval_metrics
                batch, metric_added = self._add_document_to_batch(doc, batch, target_value, aggregator_sentences)

                # We can only have 1 target field for metrics (as opposed to terms), so the total number of
                # targets added is the same as the total number of aggregator sentences that were processed
                # for this document
                if metric_added:
                    total_metrics_in_batch += len(aggregator_sentences)

            is_last_batch = (logging.current_step == self.total_events)  # Check if it is the last batch
            # Run if it is the last batch OR if the batch size is large enough
            if is_last_batch or total_metrics_in_batch >= self.metrics_batch_eval_size:
                # Display log message
                log_message = "evaluating batch of " + "{:,}".format(total_metrics_in_batch) + " metrics "
                if remaining_metrics:
                    log_message += "(+ " + "{:,}".format(len(remaining_metrics)) + " metrics from last batch) "
                log_message += "[" + "{:,}".format(logging.current_step) + " events processed]"
                logging.logger.info(log_message)

                outliers_in_batch, remaining_metrics = self._evaluate_batch_for_outliers(batch=batch,
                                                                                         is_last_batch=is_last_batch)

                # For each result, save it in batch and in ES
                if outliers_in_batch:
                    unique_summaries_in_batch = len(set(o.outlier_dict["summary"] for o in outliers_in_batch))
                    logging.logger.info("processing " + "{:,}".format(len(outliers_in_batch)) +
                                        " outliers in batch [" + "{:,}".format(unique_summaries_in_batch) +
                                        " unique summaries]")

                    for outlier in outliers_in_batch:
                        self.process_outlier(outlier)
                else:
                    logging.logger.info("no outliers detected in batch")

                # Reset data structures for next batch
                batch = remaining_metrics
                total_metrics_in_batch = 0

    self.print_analysis_summary()
def evaluate_model(self):
    self.extract_additional_model_settings()

    eval_metrics = defaultdict()
    total_metrics_added = 0

    self.total_events = es.count_documents(search_query=self.search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.config_section_name, total_events=self.total_events)

    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating " + self.model_type + " model")
    for doc in es.scan(search_query=self.search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])

        try:
            target_value = helpers.utils.flatten_sentence(helpers.utils.get_dotkey_value(fields, self.model_settings["target"], case_sensitive=True))
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["aggregator"])
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
            continue

        metric, observations = self.calculate_metric(self.model_settings["metric"], target_value)

        if metric is not None:  # explicitly check for None, since 0 can be OK as a metric!
            total_metrics_added += 1
            for aggregator_sentence in aggregator_sentences:
                flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                eval_metrics = self.add_metric_to_batch(eval_metrics, flattened_aggregator_sentence, target_value, metric, observations, doc)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if last_batch or total_metrics_added >= settings.config.getint("metrics", "metrics_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_metrics_added) + " metrics [" + "{:,}".format(logging.current_step) + " events processed]")
            outliers, remaining_metrics = self.evaluate_batch_for_outliers(metrics=eval_metrics, model_settings=self.model_settings, last_batch=last_batch)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
            else:
                logging.logger.info("no outliers detected in batch")

            # Reset data structures for next batch
            eval_metrics = remaining_metrics.copy()
            total_metrics_added = 0

    self.print_analysis_summary()
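# add_metric_to_batch() mirrors add_term_to_batch() for the metrics model. It is not
# defined in this section; the sketch below is inferred from the call sites (an
# assumption, shown as a standalone function although it is called as a method
# above): per aggregator value we keep the metric values alongside the target
# values, observations and raw documents needed to build outliers later.
from collections import defaultdict


def add_metric_to_batch(eval_metrics_array, aggregator_value, target_value, metric, observations, doc):
    if aggregator_value not in eval_metrics_array:
        eval_metrics_array[aggregator_value] = defaultdict(list)

    eval_metrics_array[aggregator_value]["metrics"].append(metric)
    eval_metrics_array[aggregator_value]["targets"].append(target_value)
    eval_metrics_array[aggregator_value]["observations"].append(observations)
    eval_metrics_array[aggregator_value]["raw_docs"].append(doc)

    return eval_metrics_array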
def evaluate_model(self):
    self.extract_additional_model_settings()

    if "*" in self.model_settings["target"]:
        brute_force = False
        logging.logger.warning("running terms model in brute force mode, could take a long time!")

        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])
        batch_size = settings.config.getint("terms", "terms_batch_eval_size")

        self.total_events = es.count_documents(search_query=search_query)
        logging.init_ticker(total_steps=min(self.total_events, batch_size), desc=self.model_name + " - extracting brute force fields")

        field_names = set()
        num_docs_processed = 0
        for doc in es.scan(search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            fields = helpers.utils.flatten_dict(fields)

            # skip all fields that are related to outliers, we don't want to brute force them
            for field_name in list(fields.keys()):  # create list instead of iterator so we can mutate the dictionary when iterating
                if field_name.startswith('outliers.'):
                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                    fields.pop(field_name)

            field_names.update(fields.keys())

            # only process a single batch of events in order to decide which fields to brute force
            if num_docs_processed == batch_size:
                break
            else:
                num_docs_processed += 1

        logging.logger.info("going to brute force " + str(len(field_names)) + " fields")

        for field_name in field_names:
            # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
            if "." in field_name:
                self.model_settings["target"] = list([field_name])
                self.model_settings["brute_forced_field"] = field_name  # so it can be added to the outlier events automatically
                brute_force = True
    else:
        brute_force = False

    if brute_force:
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"] + " AND _exists_:" + self.model_settings["brute_forced_field"])
    else:
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    self.total_events = es.count_documents(search_query=search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(search_query=search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            if brute_force:
                observations["brute_forced_field"] = self.model_settings["brute_forced_field"]

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = self.add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if last_batch or total_terms_added >= settings.config.getint("terms", "terms_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = self.evaluate_batch_for_outliers(terms=eval_terms_array)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            if outlier_batches_trend == -3 and brute_force:
                logging.logger.info("too many batches without outliers, we are not going to continue brute forcing")
                break

            if outlier_batches_trend == 3 and brute_force:
                logging.logger.info("too many batches with outliers, we are not going to continue brute forcing")
                break

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0

    self.print_analysis_summary()
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)

    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating metrics model")

    eval_metrics = defaultdict()
    total_metrics_added = 0

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        will_process_doc = False
        try:
            target_value = helpers.utils.flatten_sentence(helpers.utils.get_dotkey_value(fields, model_settings["target"], case_sensitive=True))
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + model_name + "]")

        if will_process_doc:
            observations = dict()
            metric = None

            # ------------------------------------
            # METRIC: Calculate numerical value
            # ------------------------------------
            # Example: numerical_value("2") => 2
            if model_settings["metric"] == "numerical_value":
                try:
                    metric = float(target_value)
                    total_metrics_added = total_metrics_added + 1
                except ValueError:
                    # number can not be cast to a float, just continue
                    pass

            # ------------------------------------
            # METRIC: Calculate length of a string
            # ------------------------------------
            # Example: length("outliers") => 8
            if model_settings["metric"] == "length":
                metric = len(target_value)
                total_metrics_added = total_metrics_added + 1

            # -------------------------------------
            # METRIC: Calculate entropy of a string
            # -------------------------------------
            # Example: entropy("houston") => 2.5216406363433186
            if model_settings["metric"] == "entropy":
                metric = helpers.utils.shannon_entropy(target_value)
                total_metrics_added = total_metrics_added + 1

            # ------------------------------------------------------------------------------------
            # METRIC: Calculate total length of hexadecimal encoded substrings embedded in string
            # ------------------------------------------------------------------------------------
            if model_settings["metric"] == "hex_encoded_length":
                hex_encoded_words = list()
                target_value_words = re.split("[^a-fA-F0-9+]", str(target_value))

                for word in target_value_words:
                    # at least length 10 to have 5 encoded characters
                    if len(word) > 10 and helpers.utils.is_hex_encoded(word):
                        # let's match at least 5 characters, meaning 10 hex digits
                        hex_encoded_words.append(word)

                if len(hex_encoded_words) > 0:
                    sorted_hex_encoded_words = sorted(hex_encoded_words, key=len)
                    observations["max_hex_encoded_length"] = len(sorted_hex_encoded_words[-1])
                    observations["max_hex_encoded_word"] = sorted_hex_encoded_words[-1]
                    metric = len(sorted_hex_encoded_words[-1])
                else:
                    metric = 0

                total_metrics_added = total_metrics_added + 1

            # ------------------------------------------------------------------------------------
            # METRIC: Calculate total length of base64 encoded substrings embedded in string
            # ------------------------------------------------------------------------------------
            # Example: base64_encoded_length("houston we have a cHJvYmxlbQ==") =>
            #          base64_decoded_string: problem, base64_encoded_length: 7
            if model_settings["metric"] == "base64_encoded_length":
                base64_decoded_words = list()

                # Split on all non-Base64 characters, so we can try to convert the remaining
                # words into Base64 decoded strings
                target_value_words = re.split("[^A-Za-z0-9+/=]", str(target_value))

                for word in target_value_words:
                    decoded_word = helpers.utils.is_base64_encoded(word)
                    if decoded_word and len(decoded_word) >= 5:  # let's match at least 5 characters, meaning 10 base64 digits
                        base64_decoded_words.append(decoded_word)

                if len(base64_decoded_words) > 0:
                    sorted_base64_decoded_words = sorted(base64_decoded_words, key=len)
                    observations["max_base64_decoded_length"] = len(sorted_base64_decoded_words[-1])
                    observations["max_base64_decoded_word"] = sorted_base64_decoded_words[-1]
                    metric = len(sorted_base64_decoded_words[-1])
                else:
                    metric = 0

                total_metrics_added = total_metrics_added + 1

            # ---------------------------------------------------------
            # METRIC: Calculate total length of URLs embedded in string
            # ---------------------------------------------------------
            # Example: url_length("why don't we go http://www.dance.com") =>
            #          extracted_urls_length: 20, extracted_urls: http://www.dance.com
            if model_settings["metric"] == "url_length":
                extracted_urls_length = 0
                extracted_urls = []

                # splits on whitespace by default, and on quotes, since we most likely will
                # apply this to parameter arguments
                target_value_words = target_value.replace('"', ' ').split()

                for word in target_value_words:
                    is_url = helpers.utils.is_url(word)
                    if is_url:
                        extracted_urls_length += len(word)
                        extracted_urls.append(word)

                if extracted_urls_length > 0:
                    observations["extracted_urls_length"] = extracted_urls_length
                    observations["extracted_urls"] = ','.join(extracted_urls)

                metric = extracted_urls_length
                total_metrics_added = total_metrics_added + 1

            if metric is not None:  # explicitly check for None, since 0 can be OK as a metric!
                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_metrics = add_metric_to_batch(eval_metrics, flattened_aggregator_sentence, target_value, metric, observations, doc)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_metrics_added >= settings.config.getint("metrics", "metrics_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_metrics_added) + " metrics")
            outliers, remaining_metrics = evaluate_batch_for_outliers(metrics=eval_metrics, model_settings=model_settings, last_batch=last_batch)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
            else:
                logging.logger.info("no outliers detected in batch")

            # Reset data structures for next batch
            eval_metrics = remaining_metrics.copy()
            total_metrics_added = 0
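# helpers.utils.shannon_entropy() is used by the "entropy" metric above but not
# defined in this section. The standard Shannon entropy over the character
# distribution of a string would look like the sketch below (an assumption,
# using log base 2):
import math


def shannon_entropy(data):
    if not data:
        return 0
    entropy = 0
    for unique_char in set(data):
        p = data.count(unique_char) / len(data)
        entropy -= p * math.log(p, 2)
    return entropy


# Example: shannon_entropy("houston") == 2.5216406363433186, matching the
# docstring example in the entropy metric above.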