Example #1
    def train_model(self):
        search_query = es.filter_by_query_string(
            self.model_settings["es_query_filter"])

        train_data = list()

        self.total_events = es.count_documents(search_query=search_query)
        training_data_size_pct = settings.config.getint(
            "machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        logging.print_analysis_intro(event_type="training " + self.model_name,
                                     total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events,
                            desc=self.model_name +
                            " - preparing SVM training set")
        for doc in es.scan(search_query=search_query):
            if len(train_data) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(doc)
                train_data.append(fields)
            else:
                # We have collected sufficient training data
                break

        # Now, train the model
        if len(train_data) > 0:
            pass  # Train!!
        else:
            logging.logger.warning(
                "no sentences to train model on. Are you sure the sentence configuration is correctly defined?"
            )
Example #2
    def train_model(self):
        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        sentences = list()

        self.total_events = es.count_documents(search_query=search_query)
        training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        logging.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing word2vec training set")
        for doc in es.scan(search_query=search_query):
            if len(sentences) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(doc)
                if set(self.model_settings["sentence_format"]).issubset(fields.keys()):
                    new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                    for sentence in new_sentences:
                        sentences.append(tuple(sentence))

                    # Remove all duplicates from sentences for training - REMOVED FOR TESTING
                    # sentences = list(set(sentences))
            else:
                # We have collected sufficient training data
                break

        # Now, train the model
        if len(sentences) > 0:
            w2v_model.train_model(sentences)
        else:
            logging.logger.warning("no sentences to train model on. Are you sure the sentence configuration is correctly defined?")
Example #3
    def evaluate_model(self):
        self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                                   search_query=self.search_query,
                                                                   model_settings=self.model_settings)

        self.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

        if self.total_events > 0:
            current_batch = defaultdict()
            targets_for_next_batch = defaultdict()
            total_targets_in_batch = 0

            for doc in documents:
                logging.tick()
                target_sentences, aggregator_sentences = self._compute_aggregator_and_target_value(
                    doc, self.model_settings["target"])

                if target_sentences is not None and aggregator_sentences is not None:
                    # Add current document to current_batch
                    current_batch = self._add_document_to_batch(current_batch, target_sentences,
                                                                aggregator_sentences, doc)

                    total_targets_in_batch += len(target_sentences) * len(aggregator_sentences)

                # Evaluate batch of events against the model
                is_last_batch = (logging.current_step == self.total_events)  # Check if it is the last batch
                # Run if it is the last batch OR if the batch size is large enough

                if is_last_batch or total_targets_in_batch >= self.terms_batch_eval_size:

                    # Display log message
                    log_message = "evaluating batch of " + "{:,}".format(total_targets_in_batch) + " terms "
                    if len(targets_for_next_batch) > 0:
                        log_message += "(+ " + "{:,}".format(len(targets_for_next_batch)) + " terms from last batch) "
                    log_message += "[" + "{:,}".format(logging.current_step) + " events processed]"
                    logging.logger.info(log_message)

                    # evaluate the current batch
                    outliers_in_batch, targets_for_next_batch = self._evaluate_batch_for_outliers(batch=current_batch)

                    if outliers_in_batch:
                        unique_summaries_in_batch = len(set(o.outlier_dict["summary"] for o in outliers_in_batch))
                        logging.logger.info("processing " + "{:,}".format(len(outliers_in_batch)) +
                                            " outliers in batch [" + "{:,}".format(unique_summaries_in_batch) +
                                            " unique summaries]")

                        for outlier in outliers_in_batch:
                            self.process_outlier(outlier)

                    else:
                        logging.logger.info("no outliers processed in batch")

                    # Reset data structures for next batch
                    current_batch = targets_for_next_batch
                    total_targets_in_batch = 0

        self.print_analysis_summary()
Example #4
    def find_sudden_appearance(self, start_slide_win, end_slide_win):
        """
        Find sudden appearances of the term field(s) defined by self.model_settings["target"] within the aggregations
        defined by self.model_settings["aggregator"], for events inside the time window bounded by start_slide_win
        and end_slide_win, and create outliers. An event is considered an outlier when the term field appears for
        the first time after (end_slide_win - self.jump_win).

        :param start_slide_win: start time of the time window
        :param end_slide_win: end time of the time window
        """
        aggregator_buckets = es.scan_first_occur_documents(search_query=self.search_query,
                                                           start_time=start_slide_win,
                                                           end_time=end_slide_win,
                                                           model_settings=self.model_settings)
        # Loop over the aggregations
        for aggregator_bucket in aggregator_buckets:
            target_buckets = aggregator_bucket["target"]["buckets"]
            # Loop over the documents in aggregation
            for doc in target_buckets:
                self.num_event_proc += doc["doc_count"]
                raw_doc = doc["top_doc"]["hits"]["hits"][0]
                fields = es.extract_fields_from_document(raw_doc,
                                                         extract_derived_fields=self.model_settings[
                                                             "use_derived_fields"])
                # convert the event timestamp into the right format
                event_timestamp = dateutil.parser.parse(fields[self.model_settings["timestamp_field"]],
                                                        ignoretz=True)

                if event_timestamp > (end_slide_win - self.jump_win):
                    # retrieve extra information
                    extra_outlier_information = dict()
                    extra_outlier_information["size_time_window"] = str(self.delta_slide_win)
                    extra_outlier_information["start_time_window"] = str(start_slide_win)
                    extra_outlier_information["end_time_window"] = str(end_slide_win)
                    extra_outlier_information["aggregator"] = self.model_settings["aggregator"]
                    extra_outlier_information["aggregator_value"] = aggregator_bucket["key"]
                    extra_outlier_information["target"] = self.model_settings["target"]
                    extra_outlier_information["target_value"] = doc["key"]
                    extra_outlier_information["num_target_value_in_window"] = doc["doc_count"]

                    outlier = self.create_outlier(fields,
                                                  raw_doc,
                                                  extra_outlier_information=extra_outlier_information)
                    self.process_outlier(outlier)

                    summary = "In aggregator '%s: %s', the field(s) '%s: %s' appear(s) " \
                              "suddenly at %s of the time window of size %s." % \
                              (", ".join(self.model_settings["aggregator"]),
                               aggregator_bucket["key"],
                               " ,".join(self.model_settings["target"]),
                               doc["key"],
                               str(event_timestamp),
                               self.delta_slide_win)
                    logging.logger.debug(summary)

        logging.tick(self.num_event_proc)
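The docstring above describes a sliding window in which only first occurrences inside the final jump (end_slide_win - self.jump_win) produce outliers. The driver that moves this window is not shown in the example; the sketch below is only an assumption about how such a caller could step the window forward, using the delta_slide_win and jump_win attributes referenced above (assumed to be datetime.timedelta values).

# Hypothetical driver loop for the sliding window; the real scheduling code is not
# part of the example above.
def slide_window(model, analysis_start, analysis_end):
    # analysis_start / analysis_end are datetimes; model.delta_slide_win and
    # model.jump_win are assumed to be timedeltas.
    start_slide_win = analysis_start
    while start_slide_win + model.delta_slide_win <= analysis_end:
        end_slide_win = start_slide_win + model.delta_slide_win
        model.find_sudden_appearance(start_slide_win, end_slide_win)
        # Overlapping windows: advancing by jump_win means only the newest jump_win
        # slice of each window can contain "first appearances".
        start_slide_win += model.jump_win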
Example #5
    def evaluate_model(self):
        self.extract_extra_model_settings()

        # Train the model
        if self.model_settings["train_model"]:
            self.train_model()
            return

        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        if not w2v_model.is_trained():
            logging.logger.warning("model was not trained! Skipping analysis.")
        else:
            # Check if we need to run the test data instead of real data
            if w2v_model.use_test_data:
                logging.print_generic_intro("using test data instead of live data to evaluate model " + self.model_name)
                self.evaluate_test_sentences(w2v_model=w2v_model)
                return

            self.total_events = es.count_documents(search_query=search_query)
            logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)

            logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating word2vec model")

            raw_docs = list()
            eval_sentences = list()

            for doc in es.scan(search_query=search_query):
                logging.tick()
                fields = es.extract_fields_from_document(doc)

                try:
                    new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                    eval_sentences.extend(new_sentences)
                except KeyError:
                    logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
                    continue

                for _ in new_sentences:
                    raw_docs.append(doc)

                # Evaluate batch of events against the model
                if logging.current_step == self.total_events or len(eval_sentences) >= settings.config.getint("machine_learning", "word2vec_batch_eval_size"):
                    logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                    outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model, eval_sentences=eval_sentences, raw_docs=raw_docs)

                    if len(outliers) > 0:
                        unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")

                    # Reset data structures for next batch
                    raw_docs = list()
                    eval_sentences = list()
Example #6
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating beaconing model")

    eval_terms_array = defaultdict()
    total_terms_added = 0

    outlier_batches_trend = 0
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator fields we are processing. - [" + model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_terms_added >= settings.config.getint("beaconing", "beaconing_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = evaluate_batch_for_outliers(terms=eval_terms_array, model_settings=model_settings)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0
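The batching above relies on add_term_to_batch to group flattened target sentences per aggregator value, together with their observations and source documents. A hypothetical minimal version is sketched below; the keys used inside the per-aggregator dict are assumptions rather than the project's actual structure.

# Hypothetical sketch of an add_term_to_batch-style helper; the inner keys
# ("targets", "observations", "raw_docs") are assumptions.
from collections import defaultdict

def add_term_to_batch(eval_terms_array, aggregator_value, target_value, observations, doc):
    # Group everything needed to evaluate a batch under its aggregator key.
    if aggregator_value not in eval_terms_array:
        eval_terms_array[aggregator_value] = defaultdict(list)

    eval_terms_array[aggregator_value]["targets"].append(target_value)
    eval_terms_array[aggregator_value]["observations"].append(observations)
    eval_terms_array[aggregator_value]["raw_docs"].append(doc)

    return eval_terms_array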
Example #7
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        # Add your model logic here
        logging.logger.info(json.dumps(fields, indent=4))
Example #8
def perform_analysis():
    for name in settings.config.sections():
        if name.startswith("terms_"):
                param, model_name = name.split("terms_", 1)
                should_run_model = settings.config.getboolean("general", "run_models") and settings.config.getboolean(name, "run_model")
                should_test_model = settings.config.getboolean("general", "test_models") and settings.config.getboolean(name, "test_model")

                if should_test_model or should_run_model:
                    model_settings = extract_model_settings(name)
                    if "*" in model_settings["target"]:
                        original_model_name = model_name

                        logging.logger.warning("running terms model in brute force mode, could take a long time!")

                        lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
                        batch_size = settings.config.getint("terms", "terms_batch_eval_size")

                        total_events = es.count_documents(lucene_query=lucene_query)
                        logging.init_ticker(total_steps=min(total_events, batch_size), desc=model_name + " - extracting brute force fields")

                        field_names = set()
                        num_docs_processed = 0
                        for doc in es.scan(lucene_query=lucene_query):
                            logging.tick()
                            fields = es.extract_fields_from_document(doc)
                            fields = helpers.utils.flatten_dict(fields)

                            # skip all fields that are related to outliers, we don't want to brute force them
                            for field_name in list(fields.keys()):  # create list instead of iterator so we can mutate the dictionary when iterating
                                if field_name.startswith('outliers.'):
                                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                                    fields.pop(field_name)

                            field_names.update(fields.keys())

                            if num_docs_processed == batch_size:
                                break
                            else:
                                num_docs_processed += 1

                        logging.logger.info("going to brute force " + str(len(field_names)) + " fields")
                        for field_name in field_names:
                            model_name = original_model_name + " [" + field_name + "]"
                            # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
                            if "." in field_name:
                                model_settings["target"] = list([field_name])
                                model_settings["brute_forced_field"] = field_name  # so it can be added to the outlier events automatically
                                evaluate_model(model_name=model_name, model_settings=model_settings, brute_force=True)

                    else:
                        evaluate_model(model_name=model_name, model_settings=model_settings)
Example #9
    def evaluate_model(self):

        model_filter = {
            "bool": {
                "filter": [{
                    "term": {
                        "outliers.model_name.keyword": {
                            "value": self.model_name
                        }
                    }
                }, {
                    "term": {
                        "outliers.model_type.keyword": {
                            "value": "simplequery"
                        }
                    }
                }]
            }
        }

        exclude_hits_filter = {"bool": {"must_not": model_filter}}

        query = self.search_query

        if "filter" in query:
            query["filter"].append(exclude_hits_filter)
        else:
            query["filter"] = [exclude_hits_filter]

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=query,
            model_settings=self.model_settings)
        self.print_analysis_intro(event_type="evaluating " + self.model_type +
                                  "_" + self.model_name,
                                  total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")
        if self.total_events > 0:
            for doc in documents:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                outlier = self.create_outlier(fields, doc)
                self.process_outlier(outlier)

        self.print_analysis_summary()
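For clarity, appending exclude_hits_filter gives the search query a must_not clause that drops any document already tagged by this model, so events are not flagged twice. Roughly, the resulting filter list looks like the structure below (an illustration only; "my_model" stands in for self.model_name).

# Illustration of the query shape after the must_not clause has been appended;
# "my_model" is a placeholder for self.model_name.
query = {
    "filter": [
        # ... any filters already present in self.search_query ...
        {
            "bool": {
                "must_not": {
                    "bool": {
                        "filter": [
                            {"term": {"outliers.model_name.keyword": {"value": "my_model"}}},
                            {"term": {"outliers.model_type.keyword": {"value": "simplequery"}}},
                        ]
                    }
                }
            }
        },
    ]
}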
Example #10
    def evaluate_model(self):
        self.total_events = es.count_documents(search_query=self.search_query)
        logging.print_analysis_intro(event_type="evaluating " +
                                     self.config_section_name,
                                     total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")
        for doc in es.scan(search_query=self.search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            self.process_outlier(fields, doc)

        self.print_analysis_summary()
Example #11
    def _calculate_target_fields_to_brute_force(self):
        batch_size = settings.config.getint("terms", "terms_batch_eval_size")

        self.total_events = es.count_documents(
            index=self.es_index,
            search_query=self.search_query,
            model_settings=self.model_settings)
        logging.init_ticker(total_steps=min(self.total_events, batch_size),
                            desc=self.model_name +
                            " - extracting brute force fields")

        field_names_to_brute_force = set()
        if self.total_events > 0:
            num_docs_processed = 0
            for doc in es.scan(index=self.es_index,
                               search_query=self.search_query,
                               model_settings=self.model_settings):
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                fields = helpers.utils.flatten_dict(fields)

                # create list instead of iterator so we can mutate the dictionary when iterating
                for field_name in list(fields.keys()):
                    # skip all fields that are related to outliers, we don't want to brute force them
                    if field_name.startswith('outliers.'):
                        logging.logger.debug(
                            "not brute forcing outliers field " +
                            str(field_name))
                        continue

                    # only brute force nested fields, so not the top level fields such as timestamp,
                    # deployment name, etc.
                    if "." in field_name:
                        field_names_to_brute_force.add(field_name)

                # only process a single batch of events in order to decide which fields to brute force
                if num_docs_processed == batch_size:
                    break
                else:
                    num_docs_processed += 1

        logging.logger.info("going to brute force " +
                            str(len(field_names_to_brute_force)) + " fields")
        return field_names_to_brute_force
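The brute-force field extraction above only works because helpers.utils.flatten_dict collapses nested documents into dotted keys, which is what makes the '"." in field_name' test meaningful. A minimal sketch of such a helper, assuming plain nested dicts (the real helper may also handle lists and other edge cases):

# Hypothetical sketch of a flatten_dict-style helper producing dotted keys.
def flatten_dict(nested, parent_key="", separator="."):
    # Flattens {"a": {"b": 1}} into {"a.b": 1}.
    flat = {}
    for key, value in nested.items():
        new_key = parent_key + separator + key if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, new_key, separator))
        else:
            flat[new_key] = value
    return flat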
Example #12
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name,
                                 total_events=total_events)
    logging.init_ticker(total_steps=total_events,
                        desc=model_name + " - evaluating simplequery model")

    outliers = list()
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        outlier_summary = replace_placeholder_string_with_fields(
            model_settings["outlier_summary"], fields)
        outlier_assets = helpers.utils.extract_outlier_asset_information(
            fields, settings)
        outlier = Outlier(type=model_settings["outlier_type"],
                          reason=model_settings["outlier_reason"],
                          summary=outlier_summary)

        if len(outlier_assets) > 0:
            outlier.add_observation("assets", outlier_assets)

        outliers.append(outlier)

        es.process_outliers(doc=doc,
                            outliers=[outlier],
                            should_notify=model_settings["should_notify"])

    if len(outliers) > 0:
        unique_summaries = len(
            set(o.get_observation("summary") for o in outliers))
        logging.logger.info("total outliers in batch processed: " +
                            str(len(outliers)) + " [" + str(unique_summaries) +
                            " unique]")
Example #13
    def train_model(self):
        train_data = list()

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=self.search_query,
            model_settings=self.model_settings)
        training_data_size_pct = settings.config.getint(
            "machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        self.print_analysis_intro(event_type="training " + self.model_name,
                                  total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events,
                            desc=self.model_name + " - preparing training set")
        if self.total_events > 0:
            for doc in documents:
                if len(train_data) < total_training_events:
                    logging.tick()
                    fields = es.extract_fields_from_document(
                        doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                    train_data.append(fields)
                else:
                    # We have collected sufficient training data
                    break

        # Now, train the model
        if train_data:
            pass  # Train!!
        else:
            logging.logger.warning(
                "no sentences to train model on. Are you sure the sentence configuration is "
                + "correctly defined?")
Example #14
    def train_model(self, sentences):
        sentences, words_to_indices, indices_to_words, words = flatten_and_build_indices(
            sentences)

        # Global position within sentences array
        sentence_index = 0
        vocabulary_size = len(set(words))  # Number of unique words in our vocabulary

        logging.logger.debug("number of training sentences: " +
                             str(len(sentences)))
        logging.logger.debug("words: " + str(len(words)))
        logging.logger.debug("vocabulary size: " + str(vocabulary_size))

        graph = tf.Graph()

        with graph.as_default():
            with tf.name_scope('inputs'):
                # Placeholders are structures for feeding input values
                train_inputs = tf.placeholder(tf.int32,
                                              shape=[self.batch_size])
                train_labels = tf.placeholder(tf.int32,
                                              shape=[self.batch_size, 1])

            # Ops and variables pinned to the CPU because of missing GPU implementation
            with tf.device('/cpu:0'):
                # Define embedding matrix variable
                # Variables are the parameters of the model that are being optimized
                with tf.name_scope('embeddings'):
                    embeddings = tf.Variable(tf.random_uniform(
                        [vocabulary_size, self.embedding_size], -1.0, 1.0),
                                             name="embeddings")
                    # Take an input vector of integer indices,
                    # and “look up” these indices in the supplied embeddings tensor.
                    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

                # Construct the variables for the NCE loss
                with tf.name_scope('weights'):
                    nce_weights = tf.Variable(tf.truncated_normal(
                        [vocabulary_size, self.embedding_size],
                        stddev=1.0 / math.sqrt(self.embedding_size)),
                                              name="weights")
                with tf.name_scope('biases'):
                    nce_biases = tf.Variable(tf.zeros([vocabulary_size]),
                                             name="biases")

            # Compute the average NCE loss for the batch.
            # tf.nce_loss automatically draws a new sample of the negative labels each
            # time we evaluate the loss.
            with tf.name_scope('loss'):
                loss = tf.reduce_mean(
                    tf.nn.nce_loss(weights=nce_weights,
                                   biases=nce_biases,
                                   labels=train_labels,
                                   inputs=embed,
                                   num_sampled=self.num_sampled,
                                   num_classes=vocabulary_size))

            # Add the loss value as a scalar to summary.
            tf.summary.scalar('loss', loss)

            # Construct the SGD optimizer using a learning rate of 1.0.
            with tf.name_scope('optimizer'):
                optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(
                    loss)

            # Compute the cosine similarity between minibatch examples and all embeddings.
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),
                                         1,
                                         keepdims=True),
                           name="norm")
            # normalized_embeddings = embeddings / norm

            # Merge all summaries.
            merged = tf.summary.merge_all()

            # Add variable initializer.
            init = tf.global_variables_initializer()

            # Add saver
            # Save only latest model
            saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)

        # BEGIN TRAINING
        logging.logger.info("training word2vec model using " +
                            str(len(sentences)) + " samples")
        logging.init_ticker(total_steps=self.num_steps,
                            desc=self.model_name +
                            " - training word2vec model")

        with tf.Session(graph=graph) as session:
            # Open a writer to write summaries.
            writer = tf.summary.FileWriter(self.log_dir, session.graph)

            # We must initialize all variables before we use them.
            init.run()
            logging.logger.debug('Initialized all variables')
            logging.logger.debug(norm)

            average_loss = 0
            average_historical_loss = list()

            step = 0
            while step < self.num_steps:
                logging.tick()

                batch_inputs, batch_labels, sentence_index = generate_batch(
                    self.batch_size, self.skip_window, sentences,
                    sentence_index)
                feed_dict = {
                    train_inputs: batch_inputs,
                    train_labels: batch_labels
                }

                # Define metadata variable.
                run_metadata = tf.RunMetadata()

                # We perform one update step by evaluating the optimizer op (including it in the list of returned values for session.run())
                # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
                # Feed metadata variable to session for visualizing the graph in TensorBoard.
                _, summary, loss_val = session.run([optimizer, merged, loss],
                                                   feed_dict=feed_dict,
                                                   run_metadata=run_metadata)
                average_loss += loss_val

                # Add returned summaries to writer in each step.
                writer.add_summary(summary, step)
                # Add metadata to visualize the graph for the last run.
                if step == (self.num_steps - 1):
                    writer.add_run_metadata(run_metadata, 'step%d' % step)

                if step % 1000 == 0:
                    if step > 0:
                        average_loss /= 1000
                        average_historical_loss.append(average_loss)
                    # The average loss is an estimate of the loss over the last 1000 steps.
                    logging.logger.info('average loss at step ' + str(step) +
                                        ': ' + str(average_loss))
                    average_loss = 0

                    # Check if historical loss is showing signs of improvement
                    if len(average_historical_loss) >= 10:
                        if np.std(average_historical_loss[-10:]) < 1:
                            logging.logger.info(
                                "loss seems to have stabilized, stopping training process"
                            )
                            step = self.num_steps - 1

                if step % self.save_every == 0:
                    saver.save(session, self.meta_graph_dir)

                step = step + 1

            # Save used embeddings together with the model
            with open(self.words_to_indices_filename, "w") as f:
                json.dump(words_to_indices, f)

            with open(self.indices_to_words_filename, "w") as f:
                json.dump(indices_to_words, f)

            # Write corresponding labels for the embeddings.
            with open(self.metadata_filename, 'w') as f:
                for i in range(vocabulary_size):
                    f.write(indices_to_words[i] + '\n')

            # Create a configuration for visualizing embeddings with the labels in TensorBoard.
            config = projector.ProjectorConfig()
            embedding_conf = config.embeddings.add()
            embedding_conf.tensor_name = embeddings.name
            embedding_conf.metadata_path = self.metadata_filename
            projector.visualize_embeddings(writer, config)

            writer.close()
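The training loop above depends on a generate_batch helper that walks the index-encoded sentences and emits skip-gram (center, context) pairs sized to batch_size. The sketch below is only an assumption about how such a helper could behave (window handling and wrap-around policy in the real code may differ); it does match the shapes fed to train_inputs and train_labels.

# Hypothetical sketch of a skip-gram generate_batch helper matching the call above;
# wrap-around and window details are assumptions.
import numpy as np

def generate_batch(batch_size, skip_window, sentences, sentence_index):
    # sentences are assumed to be lists of word indices (see flatten_and_build_indices).
    batch_inputs = np.zeros(batch_size, dtype=np.int32)
    batch_labels = np.zeros((batch_size, 1), dtype=np.int32)

    pair_count = 0
    while pair_count < batch_size:
        sentence = sentences[sentence_index % len(sentences)]

        for center_pos, center_word in enumerate(sentence):
            window_start = max(0, center_pos - skip_window)
            window_end = min(len(sentence), center_pos + skip_window + 1)

            for context_pos in range(window_start, window_end):
                if context_pos == center_pos or pair_count >= batch_size:
                    continue
                batch_inputs[pair_count] = center_word
                batch_labels[pair_count, 0] = sentence[context_pos]
                pair_count += 1

        sentence_index += 1  # move on to the next sentence

    return batch_inputs, batch_labels, sentence_index % len(sentences)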
Example #15
    def evaluate_target(self, target, search_query, brute_force=False):
        self.total_events = es.count_documents(index=self.es_index,
                                               search_query=search_query)

        logging.print_analysis_intro(event_type="evaluating " +
                                     self.model_name,
                                     total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating terms model")

        if brute_force:
            logging.logger.info("brute forcing field %s", str(target[0]))

        eval_terms_array = defaultdict()
        total_terms_added = 0

        outlier_batches_trend = 0
        for doc in es.scan(index=self.es_index, search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(
                doc, extract_derived_fields=self.model_settings["use_derived_fields"])

            try:
                target_sentences = helpers.utils.flatten_fields_into_sentences(
                    fields=fields, sentence_format=target)
                aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                    fields=fields,
                    sentence_format=self.model_settings["aggregator"])
                will_process_doc = True
            except (KeyError, TypeError):
                logging.logger.debug(
                    "Skipping event which does not contain the target and aggregator fields we are processing. - ["
                    + self.model_name + "]")
                will_process_doc = False

            if will_process_doc:
                observations = dict()

                if brute_force:
                    observations["brute_forced_field"] = self.model_settings[
                        "brute_forced_field"]

                for target_sentence in target_sentences:
                    flattened_target_sentence = helpers.utils.flatten_sentence(
                        target_sentence)

                    for aggregator_sentence in aggregator_sentences:
                        flattened_aggregator_sentence = helpers.utils.flatten_sentence(
                            aggregator_sentence)
                        eval_terms_array = self.add_term_to_batch(
                            eval_terms_array, flattened_aggregator_sentence,
                            flattened_target_sentence, observations, doc)

                total_terms_added += len(target_sentences)

            # Evaluate batch of events against the model
            last_batch = (logging.current_step == self.total_events)
            if last_batch or total_terms_added >= settings.config.getint(
                    "terms", "terms_batch_eval_size"):
                logging.logger.info("evaluating batch of " +
                                    "{:,}".format(total_terms_added) +
                                    " terms")
                outliers = self.evaluate_batch_for_outliers(
                    terms=eval_terms_array)

                if len(outliers) > 0:
                    unique_summaries = len(
                        set(o.outlier_dict["summary"] for o in outliers))
                    logging.logger.info("total outliers in batch processed: " +
                                        str(len(outliers)) + " [" +
                                        str(unique_summaries) +
                                        " unique summaries]")
                    outlier_batches_trend += 1
                else:
                    logging.logger.info("no outliers detected in batch")
                    outlier_batches_trend -= 1

                if outlier_batches_trend == -3 and brute_force:
                    logging.logger.info(
                        "too many batches without outliers, we are not going to continue brute forcing"
                    )
                    break

                if outlier_batches_trend == 3 and brute_force:
                    logging.logger.info(
                        "too many batches with outliers, we are not going to continue brute forcing"
                    )
                    break

                # Reset data structures for next batch
                eval_terms_array = defaultdict()
                total_terms_added = 0

        self.print_analysis_summary()
Example #16
    def evaluate_model(self):
        batch = defaultdict()  # Contain the current batch information
        remaining_metrics = defaultdict()
        total_metrics_in_batch = 0

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=self.search_query,
            model_settings=self.model_settings)

        self.print_analysis_intro(event_type="evaluating " + self.model_name,
                                  total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")
        if self.total_events > 0:
            for doc in documents:
                logging.tick()

                # Extract target and aggregator values
                target_value, aggregator_sentences = self._compute_aggregator_and_target_value(
                    doc)

                # If target and aggregator values exist
                if target_value is not None and aggregator_sentences is not None:
                    # Add current document to eval_metrics
                    batch, metric_added = self._add_document_to_batch(
                        doc, batch, target_value, aggregator_sentences)

                    # We can only have 1 target field for metrics (as opposed to terms), so the total number of targets
                    # added is the same as the total number of aggregator sentences that were processed for this
                    # document
                    if metric_added:
                        total_metrics_in_batch += len(aggregator_sentences)

                is_last_batch = (logging.current_step == self.total_events)  # Check if it is the last batch
                # Run if it is the last batch OR if the batch size is large enough
                if is_last_batch or total_metrics_in_batch >= self.metrics_batch_eval_size:

                    # Display log message
                    log_message = "evaluating batch of " + "{:,}".format(
                        total_metrics_in_batch) + " metrics "
                    if remaining_metrics:
                        log_message += "(+ " + "{:,}".format(
                            len(remaining_metrics)
                        ) + " metrics from last batch) "
                    log_message += "[" + "{:,}".format(
                        logging.current_step) + " events processed]"
                    logging.logger.info(log_message)

                    outliers_in_batch, remaining_metrics = self._evaluate_batch_for_outliers(
                        batch=batch, is_last_batch=is_last_batch)

                    # For each result, save it in batch and in ES
                    if outliers_in_batch:
                        unique_summaries_in_batch = len(
                            set(o.outlier_dict["summary"]
                                for o in outliers_in_batch))
                        logging.logger.info(
                            "processing " +
                            "{:,}".format(len(outliers_in_batch)) +
                            " outliers in batch [" +
                            "{:,}".format(unique_summaries_in_batch) +
                            " unique summaries]")

                        for outlier in outliers_in_batch:
                            self.process_outlier(outlier)
                    else:
                        logging.logger.info("no outliers detected in batch")

                    # Reset data structures for next batch
                    batch = remaining_metrics
                    total_metrics_in_batch = 0

        self.print_analysis_summary()
Example #17
    def evaluate_model(self):
        self.extract_additional_model_settings()

        eval_metrics = defaultdict()
        total_metrics_added = 0

        self.total_events = es.count_documents(search_query=self.search_query)
        logging.print_analysis_intro(event_type="evaluating " +
                                     self.config_section_name,
                                     total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")
        for doc in es.scan(search_query=self.search_query):
            logging.tick()

            fields = es.extract_fields_from_document(
                doc, extract_derived_fields=self.model_settings["use_derived_fields"])

            try:
                target_value = helpers.utils.flatten_sentence(
                    helpers.utils.get_dotkey_value(
                        fields,
                        self.model_settings["target"],
                        case_sensitive=True))
                aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                    fields=fields,
                    sentence_format=self.model_settings["aggregator"])
            except (KeyError, TypeError):
                logging.logger.debug(
                    "skipping event which does not contain the target and aggregator fields we are processing. - ["
                    + self.model_name + "]")
                continue

            metric, observations = self.calculate_metric(
                self.model_settings["metric"], target_value)

            if metric is not None:  # explicitly check for none, since "0" can be OK as a metric!
                total_metrics_added += 1
                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(
                        aggregator_sentence)
                    eval_metrics = self.add_metric_to_batch(
                        eval_metrics, flattened_aggregator_sentence,
                        target_value, metric, observations, doc)

            # Evaluate batch of events against the model
            last_batch = (logging.current_step == self.total_events)
            if last_batch or total_metrics_added >= settings.config.getint(
                    "metrics", "metrics_batch_eval_size"):
                logging.logger.info("evaluating batch of " +
                                    "{:,}".format(total_metrics_added) +
                                    " metrics [" +
                                    "{:,}".format(logging.current_step) +
                                    " events processed]")
                outliers, remaining_metrics = self.evaluate_batch_for_outliers(
                    metrics=eval_metrics,
                    model_settings=self.model_settings,
                    last_batch=last_batch)

                if len(outliers) > 0:
                    unique_summaries = len(
                        set(o.outlier_dict["summary"] for o in outliers))
                    logging.logger.info("total outliers in batch processed: " +
                                        str(len(outliers)) + " [" +
                                        str(unique_summaries) +
                                        " unique summaries]")
                else:
                    logging.logger.info("no outliers detected in batch")

                # Reset data structures for next batch
                eval_metrics = remaining_metrics.copy()
                total_metrics_added = 0

        self.print_analysis_summary()
Example #18
    def evaluate_model(self):
        self.extract_additional_model_settings()

        if "*" in self.model_settings["target"]:
            brute_force = False

            logging.logger.warning("running terms model in brute force mode, could take a long time!")

            search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])
            batch_size = settings.config.getint("terms", "terms_batch_eval_size")

            self.total_events = es.count_documents(search_query=search_query)
            logging.init_ticker(total_steps=min(self.total_events, batch_size), desc=self.model_name + " - extracting brute force fields")

            field_names = set()
            num_docs_processed = 0
            for doc in es.scan(search_query=search_query):
                logging.tick()
                fields = es.extract_fields_from_document(doc)
                fields = helpers.utils.flatten_dict(fields)

                # skip all fields that are related to outliers, we don't want to brute force them
                for field_name in list(fields.keys()):  # create list instead of iterator so we can mutate the dictionary when iterating
                    if field_name.startswith('outliers.'):
                        logging.logger.debug("not brute forcing outliers field " + str(field_name))
                        fields.pop(field_name)

                field_names.update(fields.keys())

                # only process a single batch of events in order to decide which fields to brute force
                if num_docs_processed == batch_size:
                    break
                else:
                    num_docs_processed += 1

            logging.logger.info("going to brute force " + str(len(field_names)) + " fields")
            for field_name in field_names:
                # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
                if "." in field_name:
                    self.model_settings["target"] = list([field_name])
                    self.model_settings["brute_forced_field"] = field_name  # so it can be added to the outlier events automatically
                    brute_force = True

        else:
            brute_force = False

        if brute_force:
            search_query = es.filter_by_query_string(self.model_settings["es_query_filter"] + " AND _exists_:" + self.model_settings["brute_forced_field"])
        else:
            search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        self.total_events = es.count_documents(search_query=search_query)

        logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

        eval_terms_array = defaultdict()
        total_terms_added = 0

        outlier_batches_trend = 0
        for doc in es.scan(search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)

            try:
                target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["target"])
                aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["aggregator"])
                will_process_doc = True
            except (KeyError, TypeError):
                logging.logger.debug("Skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
                will_process_doc = False

            if will_process_doc:
                observations = dict()

                if brute_force:
                    observations["brute_forced_field"] = self.model_settings["brute_forced_field"]

                for target_sentence in target_sentences:
                    flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                    for aggregator_sentence in aggregator_sentences:
                        flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                        eval_terms_array = self.add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

                total_terms_added += len(target_sentences)

            # Evaluate batch of events against the model
            last_batch = (logging.current_step == self.total_events)
            if last_batch or total_terms_added >= settings.config.getint("terms", "terms_batch_eval_size"):
                logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
                outliers = self.evaluate_batch_for_outliers(terms=eval_terms_array)

                if len(outliers) > 0:
                    unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                    logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                    outlier_batches_trend += 1
                else:
                    logging.logger.info("no outliers detected in batch")
                    outlier_batches_trend -= 1

                if outlier_batches_trend == -3 and brute_force:
                    logging.logger.info("too many batches without outliers, we are not going to continue brute forcing")
                    break

                if outlier_batches_trend == 3 and brute_force:
                    logging.logger.info("too many batches with outliers, we are not going to continue brute forcing")
                    break

                # Reset data structures for next batch
                eval_terms_array = defaultdict()
                total_terms_added = 0

        self.print_analysis_summary()
Example #19
def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating metrics model")

    eval_metrics = defaultdict()
    total_metrics_added = 0

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        will_process_doc = False

        try:
            target_value = helpers.utils.flatten_sentence(helpers.utils.get_dotkey_value(fields, model_settings["target"], case_sensitive=True))
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + model_name + "]")

        if will_process_doc:
            observations = dict()
            metric = None

            # ------------------------------------
            # METRIC: Calculate numerical value
            # ------------------------------------
            # Example: numerical_value("2") => 2
            if model_settings["metric"] == "numerical_value":
                try:
                    metric = float(target_value)
                    total_metrics_added = total_metrics_added + 1
                except ValueError:
                    # number can not be casted to a Float, just continue
                    pass
            # ------------------------------------
            # METRIC: Calculate length of a string
            # ------------------------------------
            # Example: length("outliers") => 8
            if model_settings["metric"] == "length":
                metric = len(target_value)
                total_metrics_added = total_metrics_added + 1

            # -------------------------------------
            # METRIC: Calculate entropy of a string
            # -------------------------------------
            # Example: entropy("houston") => 2.5216406363433186
            if model_settings["metric"] == "entropy":
                metric = helpers.utils.shannon_entropy(target_value)
                total_metrics_added = total_metrics_added + 1

            # ------------------------------------------------------------------------------------
            # METRIC: Calculate total length of hexadecimal encoded substrings embedded in string
            # ------------------------------------------------------------------------------------
            if model_settings["metric"] == "hex_encoded_length":
                hex_encoded_words = list()
                target_value_words = re.split("[^a-fA-F0-9+]", str(target_value))  # at least length 10 to have 5 encoded characters

                for word in target_value_words:
                    if len(word) > 10 and helpers.utils.is_hex_encoded(word):  # let's match at least 5 characters, meaning 10 hex digits
                        hex_encoded_words.append(word)

                if len(hex_encoded_words) > 0:
                    sorted_hex_encoded_words = sorted(hex_encoded_words, key=len)
                    observations["max_hex_encoded_length"] = len(sorted_hex_encoded_words[-1])
                    observations["max_hex_encoded_word"] = sorted_hex_encoded_words[-1]

                    metric = len(sorted_hex_encoded_words[-1])
                else:
                    metric = 0

                total_metrics_added = total_metrics_added + 1

            # ------------------------------------------------------------------------------------
            # METRIC: Calculate total length of base64 encoded substrings embedded in string
            # ------------------------------------------------------------------------------------
            # Example: base64_encoded_length("houston we have a cHJvYmxlbQ==") => base64_decoded_string: problem, base64_encoded_length: 7
            if model_settings["metric"] == "base64_encoded_length":
                base64_decoded_words = list()

                # Split all non-Base64 characters, so we can try to convert them to Base64 decoded strings
                target_value_words = re.split("[^A-Za-z0-9+/=]", str(target_value))

                for word in target_value_words:
                    decoded_word = helpers.utils.is_base64_encoded(word)
                    if decoded_word and len(decoded_word) >= 5:  # let's match at least 5 characters, meaning 10 base64 digits
                        base64_decoded_words.append(decoded_word)

                if len(base64_decoded_words) > 0:
                    sorted_base64_decoded_words = sorted(base64_decoded_words, key=len)
                    observations["max_base64_decoded_length"] = len(sorted_base64_decoded_words[-1])
                    observations["max_base64_decoded_word"] = sorted_base64_decoded_words[-1]

                    metric = len(sorted_base64_decoded_words[-1])
                else:
                    metric = 0

                total_metrics_added = total_metrics_added + 1

            # ---------------------------------------------------------
            # METRIC: Calculate total length of URLs embedded in string
            # ---------------------------------------------------------
            # Example: url_length("why don't we go http://www.dance.com") => extracted_urls_length: 20, extracted_urls: http://www.dance.com
            if model_settings["metric"] == "url_length":
                extracted_urls_length = 0
                extracted_urls = []

                # if the target value is a list of strings, convert it into a single list of strings
                target_value_words = target_value.replace('"', ' ').split()  # splits on whitespace by default, and on quotes, since we most likely will apply this to parameter arguments

                for word in target_value_words:
                    is_url = helpers.utils.is_url(word)
                    if is_url:
                        extracted_urls_length += len(word)
                        extracted_urls.append(word)

                if extracted_urls_length > 0:
                    observations["extracted_urls_length"] = extracted_urls_length
                    observations["extracted_urls"] = ','.join(extracted_urls)

                metric = extracted_urls_length
                total_metrics_added = total_metrics_added + 1

            if metric is not None:  # explicitly check for none, since "0" can be OK as a metric!
                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_metrics = add_metric_to_batch(eval_metrics, flattened_aggregator_sentence, target_value, metric, observations, doc)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_metrics_added >= settings.config.getint("metrics", "metrics_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_metrics_added) + " metrics")
            outliers, remaining_metrics = evaluate_batch_for_outliers(metrics=eval_metrics, model_settings=model_settings, last_batch=last_batch)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
            else:
                logging.logger.info("no outliers detected in batch")

            # Reset data structures for next batch
            eval_metrics = remaining_metrics.copy()
            total_metrics_added = 0
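The entropy metric above delegates to helpers.utils.shannon_entropy. A straightforward Shannon entropy over character frequencies reproduces the value quoted in the comment (entropy("houston") => 2.5216...); the sketch below works under that assumption, and the project's actual helper may normalize differently.

# Hypothetical sketch of a shannon_entropy helper consistent with the example value
# quoted above; the project's actual helper may differ.
import math
from collections import Counter

def shannon_entropy(data):
    # Shannon entropy (in bits) over the character frequencies of a string.
    if not data:
        return 0

    counts = Counter(data)
    total = len(data)
    return -sum((count / total) * math.log2(count / total) for count in counts.values())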