def process_outlier(decision_frontier, non_outlier_values_sample, term_value_count, terms, aggregator_value, ii, term_value, model_settings):
    # Extract fields from raw document
    fields = es.extract_fields_from_document(terms[aggregator_value]["raw_docs"][ii])

    observations = terms[aggregator_value]["observations"][ii]
    observations["non_outlier_values_sample"] = non_outlier_values_sample
    observations["aggregator"] = aggregator_value
    observations["term"] = term_value
    observations["term_count"] = term_value_count
    observations["decision_frontier"] = decision_frontier
    observations["trigger_method"] = str(model_settings["trigger_method"])
    observations["confidence"] = np.abs(decision_frontier - term_value_count)

    merged_fields_and_observations = helpers.utils.merge_two_dicts(fields, observations)
    outlier_summary = helpers.utils.replace_placeholder_fields_with_values(model_settings["outlier_summary"], merged_fields_and_observations)

    outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
    if len(outlier_assets) > 0:
        observations["assets"] = outlier_assets

    outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)

    for k, v in observations.items():
        outlier.add_observation(k, v)

    es.process_outliers(doc=terms[aggregator_value]["raw_docs"][ii], outliers=[outlier], should_notify=model_settings["should_notify"])
    return outlier
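
# A minimal, self-contained sketch of the two utility steps used above: merging document
# fields with observations, then filling "{placeholder}" tokens in the outlier summary
# template from the merged dict. These sketch functions are my own illustration under
# assumed semantics, not the project's helpers.utils implementations.
def merge_two_dicts_sketch(x, y):
    merged = x.copy()
    merged.update(y)  # on key collisions, values from y (the observations) win
    return merged

def replace_placeholders_sketch(template, values):
    # naive substitution: replace each "{key}" token with its stringified value
    for key, value in values.items():
        template = template.replace("{" + key + "}", str(value))
    return template

# Usage example with hypothetical data:
# fields = {"user": "admin", "hostname": "srv-01"}
# observations = {"term_count": 15, "decision_frontier": 10}
# merged = merge_two_dicts_sketch(fields, observations)
# replace_placeholders_sketch("{user} on {hostname}: {term_count} terms", merged)
# -> "admin on srv-01: 15 terms"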
def test_add_outlier_to_doc(self):
    test_outlier = Outlier(type="dummy type", reason="dummy reason", summary="dummy summary")
    test_outlier.add_observation(field_name="observation", field_value="dummy observation")

    # Deep-copy the shared fixture so this test cannot mutate it for other tests
    doc = copy.deepcopy(doc_without_outlier_test_file)
    doc_with_outlier = helpers.es.add_outlier_to_document(doc, test_outlier)
    self.assertDictEqual(doc_with_outlier_test_file, doc_with_outlier)
def evaluate_batch_for_outliers(w2v_model=None, eval_sentences=None, raw_docs=None, model_settings=None):
    # Initialize
    outliers = list()

    # sentence_probs contains one probability per evaluated sentence: how likely the
    # trained word2vec model considers that sentence, given the words it is made of.
    sentence_probs = w2v_model.evaluate_sentences(eval_sentences)

    for i, single_sentence_prob in enumerate(sentence_probs):
        # If the probability is NaN, the sentence could not be evaluated and we can't reason about it.
        # This happens, for example, whenever the sentence is made up entirely of words that are
        # unknown to the trained model.
        if np.isnan(single_sentence_prob):
            continue

        unique_probs = list(set(sentence_probs))

        # Alternative trigger methods, kept for reference:
        # if is_outlier_cutoff_percentage(single_sentence_prob, cutoff=0.005):
        # if is_outlier_std(single_sentence_prob, unique_probs, model_settings):
        if is_outlier_mad(single_sentence_prob, unique_probs, model_settings):
            outlier_summary = model_settings["outlier_summary"]

            # Extract fields from raw document
            fields = es.extract_fields_from_document(raw_docs[i])
            outlier_summary = replace_placeholder_string_with_fields(outlier_summary, fields)

            outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)
            outlier.add_observation("probability", str(single_sentence_prob))
            outliers.append(outlier)

            es.process_outliers(doc=raw_docs[i], outliers=[outlier], should_notify=model_settings["should_notify"])
        else:
            if w2v_model.use_test_data:
                logging.logger.info("Not an outlier: " + str(eval_sentences[i]) + " - " + str(single_sentence_prob))

    return outliers
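
# A self-contained sketch of a MAD-based outlier test like the one called above. This is
# my own illustration, not the project's is_outlier_mad: the real function takes
# model_settings (reading trigger_sensitivity from it), while this sketch takes a plain
# sensitivity parameter. A value is flagged when it lies more than `sensitivity`
# median-absolute-deviations below the median of the batch.
import numpy as np

def is_outlier_mad_sketch(value, values, sensitivity=3):
    values = np.asarray(values, dtype=float)
    median = np.median(values)
    mad = np.median(np.abs(values - median))  # median absolute deviation
    if mad == 0:
        return False  # degenerate batch: all values (nearly) identical, nothing stands out
    return value < median - sensitivity * mad

# probs = [0.41, 0.39, 0.44, 0.40, 0.42, 0.02]
# [p for p in probs if is_outlier_mad_sketch(p, probs)]  # -> [0.02]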
def test_add_two_outliers_to_doc(self):
    test_outlier = Outlier(type="dummy type", reason="dummy reason", summary="dummy summary")
    test_outlier.add_observation(field_name="observation", field_value="dummy observation")

    test_outlier_2 = Outlier(type="dummy type 2", reason="dummy reason 2", summary="dummy summary 2")
    test_outlier_2.add_observation(field_name="observation_2", field_value="dummy observation 2")

    doc = copy.deepcopy(doc_without_outlier_test_file)
    doc_with_outlier = helpers.es.add_outlier_to_document(doc, test_outlier)
    doc_with_two_outliers = helpers.es.add_outlier_to_document(doc_with_outlier, test_outlier_2)

    self.assertDictEqual(doc_with_two_outliers, doc_with_two_outliers_test_file)
def evaluate_batch_for_outliers(metrics=None, model_settings=None, last_batch=False):
    # Initialize
    outliers = list()
    remaining_metrics = metrics.copy()

    for aggregator_value in metrics:
        # Check if we have sufficient data (at least 100 metrics). If not, skip the aggregator
        # for now; it stays in remaining_metrics and is re-evaluated in a later batch, or
        # forced through when the last batch arrives.
        if len(metrics[aggregator_value]["metrics"]) < 100 and last_batch is False:
            continue
        else:
            # Remove from remaining metrics, as we will be handling it in a second
            del remaining_metrics[aggregator_value]

        # Calculate the decision frontier
        decision_frontier = helpers.utils.get_decision_frontier(model_settings["trigger_method"], metrics[aggregator_value]["metrics"], model_settings["trigger_sensitivity"], model_settings["trigger_on"])
        logging.logger.debug("using decision frontier " + str(decision_frontier) + " for aggregator " + str(aggregator_value) + " - " + model_settings["metric"])
        logging.logger.debug("example metric from batch for " + metrics[aggregator_value]["observations"][0]["target"] + ": " + str(metrics[aggregator_value]["metrics"][0]))

        # Check each metric in the batch against the decision frontier
        for ii, metric_value in enumerate(metrics[aggregator_value]["metrics"]):
            is_outlier = helpers.utils.is_outlier(metric_value, decision_frontier, model_settings["trigger_on"])

            if is_outlier:
                confidence = np.abs(decision_frontier - metric_value)

                # Extract fields from raw document
                fields = es.extract_fields_from_document(metrics[aggregator_value]["raw_docs"][ii])

                observations = metrics[aggregator_value]["observations"][ii]
                merged_fields_and_observations = helpers.utils.merge_two_dicts(fields, observations)
                outlier_summary = helpers.utils.replace_placeholder_fields_with_values(model_settings["outlier_summary"], merged_fields_and_observations)

                outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
                if len(outlier_assets) > 0:
                    observations["assets"] = outlier_assets

                outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)
                outlier.add_observation("metric", metric_value)
                outlier.add_observation("decision_frontier", decision_frontier)
                outlier.add_observation("confidence", confidence)

                for k, v in observations.items():
                    outlier.add_observation(k, v)

                outliers.append(outlier)
                es.process_outliers(doc=metrics[aggregator_value]["raw_docs"][ii], outliers=[outlier], should_notify=model_settings["should_notify"])

    return outliers, remaining_metrics
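
# A self-contained sketch of a decision frontier and outlier check, mirroring the
# helpers.utils.get_decision_frontier / helpers.utils.is_outlier calls above in spirit
# only (my own illustration: the real get_decision_frontier also takes a trigger_method
# selecting between strategies). Here, a stdev-style frontier is placed
# trigger_sensitivity standard deviations from the mean, and trigger_on decides which
# side of the frontier counts as an outlier.
import numpy as np

def get_decision_frontier_sketch(values, trigger_sensitivity, trigger_on="high"):
    values = np.asarray(values, dtype=float)
    if trigger_on == "high":
        return values.mean() + trigger_sensitivity * values.std()
    return values.mean() - trigger_sensitivity * values.std()

def is_outlier_sketch(value, decision_frontier, trigger_on="high"):
    return value > decision_frontier if trigger_on == "high" else value < decision_frontier

# metrics_batch = [12, 14, 13, 15, 11, 95]
# frontier = get_decision_frontier_sketch(metrics_batch, trigger_sensitivity=1)
# [m for m in metrics_batch if is_outlier_sketch(m, frontier)]  # -> [95]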
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    outliers = list()
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        outlier_summary = replace_placeholder_string_with_fields(model_settings["outlier_summary"], fields)
        outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)

        outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)
        if len(outlier_assets) > 0:
            outlier.add_observation("assets", outlier_assets)

        outliers.append(outlier)
        es.process_outliers(doc=doc, outliers=[outlier], should_notify=model_settings["should_notify"])

    if len(outliers) > 0:
        unique_summaries = len(set(o.get_observation("summary") for o in outliers))
        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique]")
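
# A hedged illustration of the model_settings dict evaluate_model expects. The keys are
# exactly those read by the function above; the values are hypothetical. In the
# simplequery model, every document matched by es_query_filter becomes an outlier, with
# the summary template filled in from the matched document's own fields.
example_model_settings = {
    "es_query_filter": "tags:powershell AND _exists_:command_line",  # hypothetical query
    "outlier_type": "suspicious process",
    "outlier_reason": "PowerShell execution detected",
    "outlier_summary": "PowerShell run by {user} on {hostname}",  # placeholders filled per doc
    "should_notify": False,
}
# evaluate_model(model_name="simplequery_powershell", model_settings=example_model_settings)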