Example #1
    def _compute_aggregator_and_target_value(self, doc):
        """
        Compute the target value and the aggregator sentences. Return the two values, or two None values if one of
        them could not be computed

        :param doc: the document for which the calculations must be made
        :return: target_value (could be None), aggregator_sentences (could be None)
        """
        fields = es.extract_fields_from_document(
            doc,
            extract_derived_fields=self.model_settings["use_derived_fields"])
        try:
            target_value = helpers.utils.flatten_sentence(
                helpers.utils.get_dotkey_value(fields,
                                               self.model_settings["target"],
                                               case_sensitive=True))
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields,
                sentence_format=self.model_settings["aggregator"])
        except (KeyError, TypeError):
            logging.logger.debug(
                "skipping event which does not contain the target and aggregator "
                + "fields we are processing. - [" + self.model_name + "]")
            return None, None

        return target_value, aggregator_sentences
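A minimal sketch of how a caller might consume this helper; the surrounding loop and the documents variable are assumptions added for illustration, not part of the original example:

    # Hypothetical caller: skip documents for which both values could not be computed.
    for doc in documents:
        target_value, aggregator_sentences = self._compute_aggregator_and_target_value(doc)
        if target_value is None or aggregator_sentences is None:
            continue  # missing target or aggregator field, nothing to evaluate
        # ... evaluate target_value against each aggregator sentence ...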
Example #2
    def _create_outlier(self, non_outlier_values, term_value_count,
                        aggregator_value, term_value, decision_frontier, terms,
                        ii):
        non_outlier_values_sample = ",".join(
            random.sample(non_outlier_values, min(3, len(non_outlier_values))))

        observations = dict()
        observations["non_outlier_values_sample"] = non_outlier_values_sample
        observations["term_count"] = term_value_count
        observations["aggregator"] = aggregator_value
        observations["term"] = term_value
        observations["decision_frontier"] = decision_frontier
        observations["trigger_method"] = str(
            self.model_settings["trigger_method"])

        calculated_observations = terms[
            observations["aggregator"]]["observations"][ii]
        calculated_observations.update(observations)

        raw_doc = terms[observations["aggregator"]]["raw_docs"][ii]
        fields = es.extract_fields_from_document(
            raw_doc,
            extract_derived_fields=self.model_settings["use_derived_fields"])
        return self.process_outlier(
            fields, raw_doc, extra_outlier_information=calculated_observations)
Example #3
    def _create_outlier(self, raw_doc):
        """
        Create outlier from raw_doc

        :param raw_doc: raw document representing one hit event from an Elasticsearch request
        :return: the created outlier
        """
        extra_outlier_information = dict()
        if self.model_settings["highlight_match"]:
            extra_outlier_information["matched_fields"] = raw_doc["highlight"]

            matched_values = dict()
            for key, fields in raw_doc["highlight"].items():
                matched_values[key] = list()
                for field in fields:
                    # Find values between tags <value> and </value>
                    values = re.findall("<value>((.|\n)*?)</value>", field)
                    matched_values[key] = [value for value, _ in values]
            extra_outlier_information["matched_values"] = str(matched_values)
        fields = es.extract_fields_from_document(
            raw_doc,
            extract_derived_fields=self.model_settings["use_derived_fields"])
        return self.create_outlier(
            fields,
            raw_doc,
            extra_outlier_information=extra_outlier_information)
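For reference, re.findall returns one tuple per match when the pattern contains more than one group, which is why only the outer group is kept in the comprehension above. The sample string below is made up for illustration:

    import re

    field = "user <value>admin</value> logged in from <value>10.0.0.1</value>"
    values = re.findall("<value>((.|\n)*?)</value>", field)
    # Each tuple holds (outer group, last character matched by the inner group).
    print([value for value, _ in values])  # ['admin', '10.0.0.1']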
Example #4
    def evaluate_batch_for_outliers(self, w2v_model=None, eval_sentences=None, raw_docs=None):
        # Initialize
        outliers = list()

        # sentence_probs: for each evaluated sentence, the probability assigned to that sentence by the trained word2vec model
        sentence_probs = w2v_model.evaluate_sentences(eval_sentences)

        for i, single_sentence_prob in enumerate(sentence_probs):
            # If the probability is nan, it means that the sentence could not be evaluated, and we can't reason about it.
            # This happens for example whenever the sentence is made up entirely of words that aren't known to the trained model.
            if np.isnan(single_sentence_prob):
                continue

            unique_probs = list(set(sentence_probs))

            decision_frontier = helpers.utils.get_decision_frontier(self.model_settings["trigger_method"], unique_probs, self.model_settings["trigger_sensitivity"], self.model_settings["trigger_on"])
            is_outlier = helpers.utils.is_outlier(single_sentence_prob, decision_frontier, self.model_settings["trigger_on"])
            if is_outlier:
                fields = es.extract_fields_from_document(raw_docs[i])
                outliers.append(self.process_outlier(fields, raw_docs[i], extra_outlier_information=None))
            else:
                if w2v_model.use_test_data:
                    logging.logger.info("Not an outlier: " + str(eval_sentences[i]) + " - " + str(single_sentence_prob))

        return outliers
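Note that an identity check such as "x is np.nan" only succeeds when the exact np.nan object was stored; np.isnan(), as used above, also recognises NaN values produced elsewhere:

    import numpy as np

    print(float("nan") is np.nan)   # False: a different NaN object
    print(np.isnan(float("nan")))   # True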
Example #5
def process_outlier(decision_frontier, non_outlier_values_sample, term_value_count, terms, aggregator_value, ii, term_value, model_settings):
    # Extract fields from raw document
    fields = es.extract_fields_from_document(terms[aggregator_value]["raw_docs"][ii])

    observations = terms[aggregator_value]["observations"][ii]

    observations["non_outlier_values_sample"] = non_outlier_values_sample
    observations["aggregator"] = aggregator_value
    observations["term"] = term_value
    observations["term_count"] = term_value_count
    observations["decision_frontier"] = decision_frontier
    observations["trigger_method"] = str(model_settings["trigger_method"])
    observations["confidence"] = np.abs(decision_frontier - term_value_count)

    merged_fields_and_observations = helpers.utils.merge_two_dicts(fields, observations)

    outlier_summary = helpers.utils.replace_placeholder_fields_with_values(model_settings["outlier_summary"], merged_fields_and_observations)

    outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)

    if len(outlier_assets) > 0:
        observations["assets"] = outlier_assets

    outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)

    for k, v in observations.items():
        outlier.add_observation(k, v)

    es.process_outliers(doc=terms[aggregator_value]["raw_docs"][ii], outliers=[outlier], should_notify=model_settings["should_notify"])
    return outlier
Example #6
    def train_model(self):
        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        sentences = list()

        self.total_events = es.count_documents(search_query=search_query)
        training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        logging.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing word2vec training set")
        for doc in es.scan(search_query=search_query):
            if len(sentences) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(doc)
                if set(self.model_settings["sentence_format"]).issubset(fields.keys()):
                    new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                    for sentence in new_sentences:
                        sentences.append(tuple(sentence))

                    # Remove all duplicates from sentences for training - REMOVED FOR TESTING
                    # sentences = list(sentences)
            else:
                # We have collected sufficient training data
                break

        # Now, train the model
        if len(sentences) > 0:
            w2v_model.train_model(sentences)
        else:
            logging.logger.warning("no sentences to train model on. Are you sure the sentence configuration is correctly defined?")
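For reference, with training_data_size_pct set to 20 and 10,000 total events, training_data_size evaluates to 2,000, so the loop above stops collecting once roughly 2,000 sentences have been gathered (or the scan runs out of documents).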
Example #7
    def train_model(self):
        search_query = es.filter_by_query_string(
            self.model_settings["es_query_filter"])

        train_data = list()

        self.total_events = es.count_documents(search_query=search_query)
        training_data_size_pct = settings.config.getint(
            "machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        logging.print_analysis_intro(event_type="training " + self.model_name,
                                     total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events,
                            desc=self.model_name +
                            " - preparing SVM training set")
        for doc in es.scan(search_query=search_query):
            if len(train_data) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(doc)
                train_data.append(fields)
            else:
                # We have collected sufficient training data
                break

        # Now, train the model
        if len(train_data) > 0:
            pass  # Train!!
        else:
            logging.logger.warning(
                "no sentences to train model on. Are you sure the sentence configuration is correctly defined?"
            )
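The training step above is left as a placeholder (pass). A hypothetical way to fill it in, assuming the extracted field dictionaries are flattened to scalar values (for example with helpers.utils.flatten_dict, as in Example #20) and that scikit-learn is available; none of this is implied by the original code:

    from sklearn.feature_extraction import DictVectorizer
    from sklearn.svm import OneClassSVM

    # Hypothetical: turn each flattened field dict into a feature vector and fit a one-class SVM.
    flat_train_data = [helpers.utils.flatten_dict(fields) for fields in train_data]
    vectorizer = DictVectorizer(sparse=True)
    X = vectorizer.fit_transform(flat_train_data)
    svm = OneClassSVM(gamma="scale")
    svm.fit(X)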
Example #8
    def test_extract_outlier_asset_information_case_insensitive_value(self):
        from helpers.singletons import settings, es

        # test case for case insensitive asset matching
        orig_doc = copy.deepcopy(doc_with_outlier_test_file)
        fields = es.extract_fields_from_document(orig_doc, extract_derived_fields=False)
        outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
        self.assertIn("ip: 192.168.67.175", outlier_assets)
Example #9
    def find_sudden_appearance(self, start_slide_win, end_slide_win):
        """
        Find sudden appearances, in the aggregations defined by self.model_settings["aggregator"], of a term field
        defined by self.model_settings["target"], in events within the time window defined by start_slide_win and
        end_slide_win, and create outliers. An event is considered an outlier when a term field appears for the first
        time after (end_slide_win - self.jump_win)

        :param start_slide_win: start time of the time window
        :param end_slide_win: end time of the time window
        """
        aggregator_buckets = es.scan_first_occur_documents(search_query=self.search_query,
                                                           start_time=start_slide_win,
                                                           end_time=end_slide_win,
                                                           model_settings=self.model_settings)
        # Loop over the aggregations
        for aggregator_bucket in aggregator_buckets:
            target_buckets = aggregator_bucket["target"]["buckets"]
            # Loop over the documents in aggregation
            for doc in target_buckets:
                self.num_event_proc += doc["doc_count"]
                raw_doc = doc["top_doc"]["hits"]["hits"][0]
                fields = es.extract_fields_from_document(raw_doc,
                                                         extract_derived_fields=self.model_settings[
                                                             "use_derived_fields"])
                # convert the event timestamp in the right format
                event_timestamp = dateutil.parser.parse(fields[self.model_settings["timestamp_field"]],
                                                        ignoretz=True)

                if event_timestamp > (end_slide_win - self.jump_win):
                    # retrieve extra information
                    extra_outlier_information = dict()
                    extra_outlier_information["size_time_window"] = str(self.delta_slide_win)
                    extra_outlier_information["start_time_window"] = str(start_slide_win)
                    extra_outlier_information["end_time_window"] = str(end_slide_win)
                    extra_outlier_information["aggregator"] = self.model_settings["aggregator"]
                    extra_outlier_information["aggregator_value"] = aggregator_bucket["key"]
                    extra_outlier_information["target"] = self.model_settings["target"]
                    extra_outlier_information["target_value"] = doc["key"]
                    extra_outlier_information["num_target_value_in_window"] = doc["doc_count"]

                    outlier = self.create_outlier(fields,
                                                  raw_doc,
                                                  extra_outlier_information=extra_outlier_information)
                    self.process_outlier(outlier)

                    summary = "In aggregator '%s: %s', the field(s) '%s: %s' appear(s) " \
                              "suddenly at %s of the time window of size %s." % \
                              (", ".join(self.model_settings["aggregator"]),
                               aggregator_bucket["key"],
                               " ,".join(self.model_settings["target"]),
                               doc["key"],
                               str(event_timestamp),
                               self.delta_slide_win)
                    logging.logger.debug(summary)

        logging.tick(self.num_event_proc)
Example #10
    def test_extract_outlier_asset_information_simple_matching(self):
        from helpers.singletons import settings, es

        orig_doc = copy.deepcopy(doc_with_outlier_test_file)
        fields = es.extract_fields_from_document(orig_doc, extract_derived_fields=False)

        # test case for simple asset matching
        outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
        self.assertIn("user: dummyuser", outlier_assets)
        self.assertIn("host: DUMMY-PC", outlier_assets)
Example #11
    def test_extract_outlier_asset_information_list_values(self):
        from helpers.singletons import settings, es

        orig_doc = copy.deepcopy(doc_with_asset_edgecases)
        fields = es.extract_fields_from_document(orig_doc, extract_derived_fields=False)

        # test case for asset fields containing multiple values in an array
        outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
        self.assertIn("user: dummyuser1", outlier_assets)  # test case for array assets
        self.assertIn("user: dummyuser2", outlier_assets)  # test case for array assets
        self.assertEqual(len(outlier_assets), 3)  # blank asset fields, such as the PC name in the JSON file, should not be extracted as assets
Example #12
    def evaluate_model(self):
        self.extract_extra_model_settings()

        # Train the model
        if self.model_settings["train_model"]:
            self.train_model()
            return

        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        if not w2v_model.is_trained():
            logging.logger.warning("model was not trained! Skipping analysis.")
        else:
            # Check if we need to run the test data instead of real data
            if w2v_model.use_test_data:
                logging.print_generic_intro("using test data instead of live data to evaluate model " + self.model_name)
                self.evaluate_test_sentences(w2v_model=w2v_model)
                return

            self.total_events = es.count_documents(search_query=search_query)
            logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)

            logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating word2vec model")

            raw_docs = list()
            eval_sentences = list()

            for doc in es.scan(search_query=search_query):
                logging.tick()
                fields = es.extract_fields_from_document(doc)

                try:
                    new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                    eval_sentences.extend(new_sentences)
                except KeyError:
                    logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
                    continue

                for _ in new_sentences:
                    raw_docs.append(doc)

                # Evaluate batch of events against the model
                if logging.current_step == self.total_events or len(eval_sentences) >= settings.config.getint("machine_learning", "word2vec_batch_eval_size"):
                    logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                    outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model, eval_sentences=eval_sentences, raw_docs=raw_docs)

                    if len(outliers) > 0:
                        unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")

                    # Reset data structures for next batch
                    raw_docs = list()
                    eval_sentences = list()
Example #13
    def is_document_whitelisted(self, document, extract_field=True):
        document_to_check = copy.deepcopy(document)
        if extract_field:
            fields = es.extract_fields_from_document(
                document_to_check,
                extract_derived_fields=self.
                model_settings["use_derived_fields"])
        else:
            fields = document
        outlier_param = self._prepare_outlier_parameters(dict(), fields)
        document_to_check['__whitelist_extra'] = outlier_param
        return Outlier.is_whitelisted_doc(document_to_check)
Example #14
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating beaconing model")

    eval_terms_array = defaultdict()
    total_terms_added = 0

    outlier_batches_trend = 0
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator fields we are processing. - [" + model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_terms_added >= settings.config.getint("beaconing", "beaconing_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = evaluate_batch_for_outliers(terms=eval_terms_array, model_settings=model_settings)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0
Example #15
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        # Add your model logic here
        logging.logger.info(json.dumps(fields, indent=4))
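As a hypothetical illustration of the placeholder above; the "tags" field and the "suspicious" marker are made-up assumptions, not part of any real model:

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        # Hypothetical model logic: flag documents whose (assumed) "tags" field mentions "suspicious".
        if "suspicious" in fields.get("tags", []):
            logging.logger.warning("potential outlier: " + json.dumps(fields))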
Example #16
    def evaluate_model(self):

        model_filter = {
            "bool": {
                "filter": [{
                    "term": {
                        "outliers.model_name.keyword": {
                            "value": self.model_name
                        }
                    }
                }, {
                    "term": {
                        "outliers.model_type.keyword": {
                            "value": "simplequery"
                        }
                    }
                }]
            }
        }

        exclude_hits_filter = {"bool": {"must_not": model_filter}}

        query = self.search_query

        if "filter" in query:
            query["filter"].append(exclude_hits_filter)
        else:
            query["filter"] = [exclude_hits_filter]

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=query,
            model_settings=self.model_settings)
        self.print_analysis_intro(event_type="evaluating " + self.model_type +
                                  "_" + self.model_name,
                                  total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")
        if self.total_events > 0:
            for doc in documents:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc,
                    extract_derived_fields=self.
                    model_settings["use_derived_fields"])
                outlier = self.create_outlier(fields, doc)
                self.process_outlier(outlier)

        self.print_analysis_summary()
Example #17
def perform_analysis():
    for name in settings.config.sections():
        if name.startswith("terms_"):
                param, model_name = name.split("terms_", 1)
                should_test_model = settings.config.getboolean("general", "test_models") and settings.config.getboolean(name, "test_model")
                should_run_model = settings.config.getboolean("general", "run_models") and settings.config.getboolean(name, "run_model")

                if should_test_model or should_run_model:
                    model_settings = extract_model_settings(name)
                    if "*" in model_settings["target"]:
                        original_model_name = model_name

                        logging.logger.warning("running terms model in brute force mode, could take a long time!")

                        lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
                        batch_size = settings.config.getint("terms", "terms_batch_eval_size")

                        total_events = es.count_documents(lucene_query=lucene_query)
                        logging.init_ticker(total_steps=min(total_events, batch_size), desc=model_name + " - extracting brute force fields")

                        field_names = set()
                        num_docs_processed = 0
                        for doc in es.scan(lucene_query=lucene_query):
                            logging.tick()
                            fields = es.extract_fields_from_document(doc)
                            fields = helpers.utils.flatten_dict(fields)

                            # skip all fields that are related to outliers, we don't want to brute force them
                            for field_name in list(fields.keys()):  # create list instead of iterator so we can mutate the dictionary when iterating
                                if field_name.startswith('outliers.'):
                                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                                    fields.pop(field_name)

                            field_names.update(fields.keys())

                            if num_docs_processed == batch_size:
                                break
                            else:
                                num_docs_processed += 1

                        logging.logger.info("going to brute force " + str(len(field_names)) + " fields")
                        for field_name in field_names:
                            model_name = original_model_name + " [" + field_name + "]"
                            # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
                            if "." in field_name:
                                model_settings["target"] = list([field_name])
                                model_settings["brute_forced_field"] = field_name  # so it can be added to the outlier events automatically
                                evaluate_model(model_name=model_name, model_settings=model_settings, brute_force=True)

                    else:
                        evaluate_model(model_name=model_name, model_settings=model_settings)
Example #18
def evaluate_batch_for_outliers(metrics=None, model_settings=None, last_batch=False):
    # Initialize
    outliers = list()
    remaining_metrics = metrics.copy()

    for i, aggregator_value in enumerate(metrics):

        # Check if we have sufficient data. if not, continue. Else, evaluate for outliers.
        if len(metrics[aggregator_value]["metrics"]) < 100 and last_batch is False:
            continue
        else:
            # Remove from remaining metrics, as we will be handling it in a second
            del remaining_metrics[aggregator_value]

        # Calculate the decision frontier
        decision_frontier = helpers.utils.get_decision_frontier(model_settings["trigger_method"], metrics[aggregator_value]["metrics"], model_settings["trigger_sensitivity"], model_settings["trigger_on"])
        logging.logger.debug("using decision frontier " + str(decision_frontier) + " for aggregator " + str(aggregator_value) + " - " + model_settings["metric"])
        logging.logger.debug("example metric from batch for " + metrics[aggregator_value]["observations"][0]["target"] + ": " + str(metrics[aggregator_value]["metrics"][0]))

        # Calculate all outliers in array
        for ii, metric_value in enumerate(metrics[aggregator_value]["metrics"]):
            is_outlier = helpers.utils.is_outlier(metric_value, decision_frontier, model_settings["trigger_on"])

            if is_outlier:
                confidence = np.abs(decision_frontier - metric_value)

                # Extract fields from raw document
                fields = es.extract_fields_from_document(metrics[aggregator_value]["raw_docs"][ii])

                observations = metrics[aggregator_value]["observations"][ii]
                merged_fields_and_observations = helpers.utils.merge_two_dicts(fields, observations)
                outlier_summary = helpers.utils.replace_placeholder_fields_with_values(model_settings["outlier_summary"], merged_fields_and_observations)

                outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
                if len(outlier_assets) > 0:
                    observations["assets"] = outlier_assets

                outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)

                outlier.add_observation("metric", metric_value)
                outlier.add_observation("decision_frontier", decision_frontier)
                outlier.add_observation("confidence", confidence)

                for k, v in observations.items():
                    outlier.add_observation(k, v)

                outliers.append(outlier)
                es.process_outliers(doc=metrics[aggregator_value]["raw_docs"][ii], outliers=[outlier], should_notify=model_settings["should_notify"])

    return outliers, remaining_metrics
Example #19
    def evaluate_model(self):
        self.total_events = es.count_documents(search_query=self.search_query)
        logging.print_analysis_intro(event_type="evaluating " +
                                     self.config_section_name,
                                     total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")
        for doc in es.scan(search_query=self.search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            self.process_outlier(fields, doc)

        self.print_analysis_summary()
Example #20
    def _calculate_target_fields_to_brute_force(self):
        batch_size = settings.config.getint("terms", "terms_batch_eval_size")

        self.total_events = es.count_documents(
            index=self.es_index,
            search_query=self.search_query,
            model_settings=self.model_settings)
        logging.init_ticker(total_steps=min(self.total_events, batch_size),
                            desc=self.model_name +
                            " - extracting brute force fields")

        field_names_to_brute_force = set()
        if self.total_events > 0:
            num_docs_processed = 0
            for doc in es.scan(index=self.es_index,
                               search_query=self.search_query,
                               model_settings=self.model_settings):
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc,
                    extract_derived_fields=self.
                    model_settings["use_derived_fields"])
                fields = helpers.utils.flatten_dict(fields)

                # create list instead of iterator so we can mutate the dictionary when iterating
                for field_name in list(fields.keys()):
                    # skip all fields that are related to outliers, we don't want to brute force them
                    if field_name.startswith('outliers.'):
                        logging.logger.debug(
                            "not brute forcing outliers field " +
                            str(field_name))
                        continue

                    # only brute force nested fields, so not the top level fields such as timestamp,
                    # deployment name, etc.
                    if "." in field_name:
                        field_names_to_brute_force.add(field_name)

                # only process a single batch of events in order to decide which fields to brute force
                if num_docs_processed == batch_size:
                    break
                else:
                    num_docs_processed += 1

        logging.logger.info("going to brute force " +
                            str(len(field_names_to_brute_force)) + " fields")
        return field_names_to_brute_force
Example #21
    def test_flush_bulk_actions_using_one_save_outlier(self):
        doc_with_outlier_with_derived_timestamp = copy.deepcopy(
            doc_with_outlier_with_derived_timestamp_test_file)
        doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
        doc_without_outlier["_source"] = es.extract_fields_from_document(
            doc_without_outlier, extract_derived_fields=True)
        self.test_es.add_doc(doc_without_outlier)

        test_outlier = Outlier(outlier_type="dummy type",
                               outlier_reason="dummy reason",
                               outlier_summary="dummy summary",
                               doc=doc_without_outlier)
        test_outlier.outlier_dict["observation"] = "dummy observation"

        es.save_outlier(test_outlier, extract_derived_fields=True)
        result = [elem for elem in es._scan()][0]
        self.assertEqual(result, doc_with_outlier_with_derived_timestamp)
Example #22
    def _compute_aggregator_and_target_value(self, doc, target):
        """
        Extract target and aggregator sentence from a document

        :param doc: document from which the data needs to be extracted
        :param target: target key name
        :return: list of target sentences and list of aggregator sentences
        """
        fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])
        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=target)
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=self.model_settings["aggregator"])
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator " +
                                 "fields we are processing. - [" + self.model_name + "]")
            return None, None
        return target_sentences, aggregator_sentences
Example #23
    def _create_outlier(self, non_outlier_values, term_value_count,
                        aggregator_value, term_value, decision_frontier, batch,
                        ii):
        """
        Create an outlier from the given parameters

        :param non_outlier_values: list of non-outlier values
        :param term_value_count: number of occurrences of the term
        :param aggregator_value: aggregator value
        :param term_value: term value
        :param decision_frontier: value of the decision frontier
        :param batch: batch
        :param ii: index of the document linked to this outlier
        :return: the created outlier
        """

        observations = dict()
        if non_outlier_values:
            non_outlier_values_sample = ",".join(
                random.sample(
                    non_outlier_values, 3 if len(non_outlier_values) > 3 else
                    len(non_outlier_values)))
            observations[
                "non_outlier_values_sample"] = non_outlier_values_sample
        else:
            observations["non_outlier_values_sample"] = []

        observations["term_count"] = term_value_count
        observations["aggregator"] = aggregator_value
        observations["term"] = term_value
        observations["decision_frontier"] = decision_frontier
        observations["trigger_method"] = str(
            self.model_settings["trigger_method"])

        calculated_observations = batch[
            observations["aggregator"]]["observations"][ii]
        calculated_observations.update(observations)

        raw_doc = batch[observations["aggregator"]]["raw_docs"][ii]
        fields = es.extract_fields_from_document(
            raw_doc,
            extract_derived_fields=self.model_settings["use_derived_fields"])
        return self.create_outlier(
            fields, raw_doc, extra_outlier_information=calculated_observations)
Example #24
    def prepare_and_process_outlier(self, decision_frontier, term_value_count,
                                    terms, aggregator_value, term_counter):
        # Extract fields from raw document
        fields = es.extract_fields_from_document(
            terms[aggregator_value]["raw_docs"][term_counter])

        observations = terms[aggregator_value]["observations"][term_counter]

        observations["aggregator"] = aggregator_value
        observations["term"] = terms[aggregator_value]["targets"][term_counter]
        observations["term_count"] = term_value_count
        observations["decision_frontier"] = decision_frontier
        observations["confidence"] = np.abs(decision_frontier -
                                            term_value_count)

        return self.process_outlier(
            fields,
            terms[aggregator_value]["raw_docs"][term_counter],
            extra_outlier_information=observations)
Example #25
def evaluate_batch_for_outliers(w2v_model=None,
                                eval_sentences=None,
                                raw_docs=None,
                                model_settings=None):
    # Initialize
    outliers = list()

    # sentence_probs: for each evaluated sentence, the probability assigned to that sentence by the trained word2vec model
    sentence_probs = w2v_model.evaluate_sentences(eval_sentences)

    for i, single_sentence_prob in enumerate(sentence_probs):
        # If the probability is nan, it means that the sentence could not be evaluated, and we can't reason about it.
        # This happens for example whenever the sentence is made up entirely of words that aren't known to the trained model.
        if np.isnan(single_sentence_prob):
            continue

        unique_probs = list(set(sentence_probs))

        # if is_outlier_cutoff_percentage(single_sentence_prob, cutoff=0.005):
        # if is_outlier_std(single_sentence_prob, unique_probs, model_settings):
        if is_outlier_mad(single_sentence_prob, unique_probs, model_settings):
            outlier_summary = model_settings["outlier_summary"]

            # Extract fields from raw document
            fields = es.extract_fields_from_document(raw_docs[i])
            outlier_summary = replace_placeholder_string_with_fields(
                outlier_summary, fields)

            outlier = Outlier(type=model_settings["outlier_type"],
                              reason=model_settings["outlier_reason"],
                              summary=outlier_summary)
            outlier.add_observation("probability", str(single_sentence_prob))

            outliers.append(outlier)
            es.process_outliers(doc=raw_docs[i],
                                outliers=[outlier],
                                should_notify=model_settings["should_notify"])
        else:
            if w2v_model.use_test_data:
                logging.logger.info("Not an outlier: " +
                                    str(eval_sentences[i]) + " - " +
                                    str(single_sentence_prob))
    return outliers
Example #26
    def _compute_fields_observation_and_create_outlier(
            self, non_outlier_values, metrics_aggregator_value, ii,
            decision_frontier, metric_value):
        """
        Extract fields from the document and compute the different elements that will be placed in the observations

        :param non_outlier_values: list of non-outlier values
        :param metrics_aggregator_value: value of the metrics aggregator
        :param ii: index of the document that has been detected as an outlier
        :param decision_frontier: the value of the decision frontier
        :param metric_value: the metric value
        :return: the created outlier
        """

        observations = metrics_aggregator_value["observations"][ii]

        if non_outlier_values:
            non_outlier_values_sample = ",".join(
                random.sample(
                    non_outlier_values, 3 if len(non_outlier_values) > 3 else
                    len(non_outlier_values)))
            observations[
                "non_outlier_values_sample"] = non_outlier_values_sample
        else:
            observations["non_outlier_values_sample"] = []

        observations["metric"] = metric_value
        observations["decision_frontier"] = decision_frontier

        confidence = np.abs(decision_frontier - metric_value)
        observations["confidence"] = confidence

        # Extract fields from raw document
        fields = es.extract_fields_from_document(
            metrics_aggregator_value["raw_docs"][ii],
            extract_derived_fields=self.model_settings["use_derived_fields"])

        outlier = self.create_outlier(fields,
                                      metrics_aggregator_value["raw_docs"][ii],
                                      extra_outlier_information=observations)
        return outlier
Example #27
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name,
                                 total_events=total_events)
    logging.init_ticker(total_steps=total_events,
                        desc=model_name + " - evaluating simplequery model")

    outliers = list()
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        outlier_summary = replace_placeholder_string_with_fields(
            model_settings["outlier_summary"], fields)
        outlier_assets = helpers.utils.extract_outlier_asset_information(
            fields, settings)
        outlier = Outlier(type=model_settings["outlier_type"],
                          reason=model_settings["outlier_reason"],
                          summary=outlier_summary)

        if len(outlier_assets) > 0:
            outlier.add_observation("assets", outlier_assets)

        outliers.append(outlier)

        es.process_outliers(doc=doc,
                            outliers=[outlier],
                            should_notify=model_settings["should_notify"])

    if len(outliers) > 0:
        unique_summaries = len(
            set(o.get_observation("summary") for o in outliers))
        logging.logger.info("total outliers in batch processed: " +
                            str(len(outliers)) + " [" + str(unique_summaries) +
                            " unique]")
Example #28
    def train_model(self):
        train_data = list()

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=self.search_query,
            model_settings=self.model_settings)
        training_data_size_pct = settings.config.getint(
            "machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        self.print_analysis_intro(event_type="training " + self.model_name,
                                  total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events,
                            desc=self.model_name + " - preparing training set")
        if self.total_events > 0:
            for doc in documents:
                if len(train_data) < total_training_events:
                    logging.tick()
                    fields = es.extract_fields_from_document(
                        doc,
                        extract_derived_fields=self.
                        model_settings["use_derived_fields"])
                    train_data.append(fields)
                else:
                    # We have collected sufficient training data
                    break

        # Now, train the model
        if train_data:
            pass  # Train!!
        else:
            logging.logger.warning(
                "no sentences to train model on. Are you sure the sentence configuration is "
                + "correctly defined?")
Example #29
    def evaluate_target(self, target, search_query, brute_force=False):
        self.total_events = es.count_documents(index=self.es_index,
                                               search_query=search_query)

        logging.print_analysis_intro(event_type="evaluating " +
                                     self.model_name,
                                     total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating terms model")

        if brute_force:
            logging.logger.info("brute forcing field %s", str(target[0]))

        eval_terms_array = defaultdict()
        total_terms_added = 0

        outlier_batches_trend = 0
        for doc in es.scan(index=self.es_index, search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(
                doc,
                extract_derived_fields=self.
                model_settings["use_derived_fields"])

            try:
                target_sentences = helpers.utils.flatten_fields_into_sentences(
                    fields=fields, sentence_format=target)
                aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                    fields=fields,
                    sentence_format=self.model_settings["aggregator"])
                will_process_doc = True
            except (KeyError, TypeError):
                logging.logger.debug(
                    "Skipping event which does not contain the target and aggregator fields we are processing. - ["
                    + self.model_name + "]")
                will_process_doc = False

            if will_process_doc:
                observations = dict()

                if brute_force:
                    observations["brute_forced_field"] = self.model_settings[
                        "brute_forced_field"]

                for target_sentence in target_sentences:
                    flattened_target_sentence = helpers.utils.flatten_sentence(
                        target_sentence)

                    for aggregator_sentence in aggregator_sentences:
                        flattened_aggregator_sentence = helpers.utils.flatten_sentence(
                            aggregator_sentence)
                        eval_terms_array = self.add_term_to_batch(
                            eval_terms_array, flattened_aggregator_sentence,
                            flattened_target_sentence, observations, doc)

                total_terms_added += len(target_sentences)

            # Evaluate batch of events against the model
            last_batch = (logging.current_step == self.total_events)
            if last_batch or total_terms_added >= settings.config.getint(
                    "terms", "terms_batch_eval_size"):
                logging.logger.info("evaluating batch of " +
                                    "{:,}".format(total_terms_added) +
                                    " terms")
                outliers = self.evaluate_batch_for_outliers(
                    terms=eval_terms_array)

                if len(outliers) > 0:
                    unique_summaries = len(
                        set(o.outlier_dict["summary"] for o in outliers))
                    logging.logger.info("total outliers in batch processed: " +
                                        str(len(outliers)) + " [" +
                                        str(unique_summaries) +
                                        " unique summaries]")
                    outlier_batches_trend += 1
                else:
                    logging.logger.info("no outliers detected in batch")
                    outlier_batches_trend -= 1

                if outlier_batches_trend == -3 and brute_force:
                    logging.logger.info(
                        "too many batches without outliers, we are not going to continue brute forcing"
                    )
                    break

                if outlier_batches_trend == 3 and brute_force:
                    logging.logger.info(
                        "too many batches with outliers, we are not going to continue brute forcing"
                    )
                    break

                # Reset data structures for next batch
                eval_terms_array = defaultdict()
                total_terms_added = 0

        self.print_analysis_summary()
Example #30
    def evaluate_batch_for_outliers(self, terms=None):
        # Initialize
        outliers = list()

        # In case we want to count terms across different aggregators, we need to first iterate over all aggregators
        # and calculate the total number of unique terms for each aggregated value.
        # For example:
        # terms["smsc.exe"][A, B, C, D, D, E]
        # terms["abc.exe"][A, A, B]
        # is converted into:
        # unique_target_counts_across_aggregators: [5, 2] (the first term contains 5 unique values, the second one contains 2)
        if self.model_settings["target_count_method"] == "across_aggregators":
            unique_target_counts_across_aggregators = list()

            # loop 0: {i=0, aggregator_value = "smsc.exe"}, loop 1: {i=1, aggregator_value = "abc.exe"},
            for i, aggregator_value in enumerate(terms):
                # unique_targets_in_aggregated_value = loop 0: [A, B, C, D, E], loop 1: [A, A, B]
                # unique_target_counts_across_aggregators = loop 0: [5], loop 1: [5, 2]
                unique_targets_in_aggregated_value = set(
                    terms[aggregator_value]["targets"])
                unique_target_counts_across_aggregators.append(
                    len(unique_targets_in_aggregated_value))

            # Calculate the decision frontier
            # unique_target_counts_across_aggregators = [5, 2]
            decision_frontier = helpers.utils.get_decision_frontier(
                self.model_settings["trigger_method"],
                unique_target_counts_across_aggregators,
                self.model_settings["trigger_sensitivity"],
                self.model_settings["trigger_on"])
            logging.logger.debug("using " +
                                 self.model_settings["trigger_method"] +
                                 " decision frontier " +
                                 str(decision_frontier) +
                                 " across all aggregators")

            non_outlier_values = set()

            # loop 0: {i=0, aggregator_value = "smsc.exe"}, loop 1: {i=1, aggregator_value = "abc.exe"},
            for i, aggregator_value in enumerate(terms):
                unique_target_count_across_aggregators = unique_target_counts_across_aggregators[
                    i]
                logging.logger.debug(
                    "unique target count for aggregator " +
                    str(aggregator_value) + ": " +
                    str(unique_target_count_across_aggregators) +
                    " - decision frontier " + str(decision_frontier))
                is_outlier = helpers.utils.is_outlier(
                    unique_target_count_across_aggregators, decision_frontier,
                    self.model_settings["trigger_on"])

                if is_outlier:
                    for ii, term_value in enumerate(
                            terms[aggregator_value]["targets"]):
                        non_outlier_values_sample = ",".join(
                            random.sample(non_outlier_values,
                                          min(3, len(non_outlier_values))))

                        observations = dict()
                        observations[
                            "non_outlier_values_sample"] = non_outlier_values_sample
                        observations[
                            "term_count"] = unique_target_count_across_aggregators
                        observations["aggregator"] = aggregator_value
                        observations["term"] = term_value
                        observations["decision_frontier"] = decision_frontier
                        observations["trigger_method"] = str(
                            self.model_settings["trigger_method"])

                        calculated_observations = terms[
                            observations["aggregator"]]["observations"][ii]
                        calculated_observations.update(observations)

                        raw_doc = terms[
                            observations["aggregator"]]["raw_docs"][ii]
                        fields = es.extract_fields_from_document(
                            raw_doc,
                            extract_derived_fields=self.
                            model_settings["use_derived_fields"])
                        outliers.append(
                            self.process_outlier(fields,
                                                 raw_doc,
                                                 extra_outlier_information=
                                                 calculated_observations))
                else:
                    for ii, term_value in enumerate(
                            terms[aggregator_value]["targets"]):
                        non_outlier_values.add(term_value)

        # In case we want to count terms within an aggregator, it's a bit easier.
        # For example:
        # terms["smsc.exe"][A, B, C, D, D, E]
        # terms["abc.exe"][A, A, B]
        # is converted into:
        # First iteration: "smsc.exe" -> counted_target_values: {A: 1, B: 1, C: 1, D: 2, E: 1}
        # For each aggregator, we iterate over all terms within it:
        # term_value_count for a document with term "A" then becomes "1" in the example above.
        # we then flag an outlier if that "1" is an outlier in the array ["1 1 1 2 1"]
        if self.model_settings["target_count_method"] == "within_aggregator":
            for i, aggregator_value in enumerate(terms):
                # Count how often each target value occurs
                counted_targets = Counter(terms[aggregator_value]["targets"])
                counted_target_values = list(counted_targets.values())

                logging.logger.debug("terms count for aggregator value " +
                                     aggregator_value + " -> " +
                                     str(counted_targets))
                decision_frontier = helpers.utils.get_decision_frontier(
                    self.model_settings["trigger_method"],
                    counted_target_values,
                    self.model_settings["trigger_sensitivity"],
                    self.model_settings["trigger_on"])

                logging.logger.debug("using " +
                                     self.model_settings["trigger_method"] +
                                     " decision frontier " +
                                     str(decision_frontier) +
                                     " for aggregator " +
                                     str(aggregator_value))

                non_outlier_values = set()
                for ii, term_value in enumerate(
                        terms[aggregator_value]["targets"]):
                    term_value_count = counted_targets[term_value]
                    is_outlier = helpers.utils.is_outlier(
                        term_value_count, decision_frontier,
                        self.model_settings["trigger_on"])

                    if is_outlier:
                        non_outlier_values_sample = ",".join(
                            random.sample(non_outlier_values,
                                          min(3, len(non_outlier_values))))

                        observations = dict()
                        observations[
                            "non_outlier_values_sample"] = non_outlier_values_sample
                        observations["term_count"] = term_value_count
                        observations["aggregator"] = aggregator_value
                        observations["term"] = term_value
                        observations["decision_frontier"] = decision_frontier
                        observations["trigger_method"] = str(
                            self.model_settings["trigger_method"])

                        calculated_observations = terms[
                            observations["aggregator"]]["observations"][ii]
                        calculated_observations.update(observations)

                        raw_doc = terms[
                            observations["aggregator"]]["raw_docs"][ii]
                        fields = es.extract_fields_from_document(
                            raw_doc,
                            extract_derived_fields=self.
                            model_settings["use_derived_fields"])
                        outliers.append(
                            self.process_outlier(fields,
                                                 raw_doc,
                                                 extra_outlier_information=
                                                 calculated_observations))
                    else:
                        non_outlier_values.add(term_value)
        return outliers