    def test_evaluate_batch_for_outliers_limit_target_buckets_two_doc_max_two(
            self):
        self.test_settings.change_configuration_path(
            "/app/tests/unit_tests/files/terms_test_01.conf")
        analyzer = TermsAnalyzer("terms_dummy_test")

        # Create one document with one aggregator
        aggregator_value = LIST_AGGREGATOR_VALUE[0]
        target_value = random.choice(LIST_TARGET_VALUE)
        observations = {}
        doc = copy.deepcopy(random.choice(LIST_DOC))
        eval_terms_array = analyzer.add_term_to_batch(defaultdict(),
                                                      aggregator_value,
                                                      target_value,
                                                      observations, doc)
        # Create a second document with another aggregator
        aggregator_value2 = LIST_AGGREGATOR_VALUE[1]
        target_value2 = random.choice(LIST_TARGET_VALUE)
        observations2 = {}
        doc2 = copy.deepcopy(random.choice(LIST_DOC))
        eval_terms_array = analyzer.add_term_to_batch(eval_terms_array,
                                                      aggregator_value2,
                                                      target_value2,
                                                      observations2, doc2)

        # Expect an empty result: each aggregator has only a single target
        # bucket, while "min_target_buckets" is set to 2
        result = analyzer.evaluate_batch_for_outliers(terms=eval_terms_array)
        self.assertEqual(result, [])
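
    # For reference, the "min_target_buckets" behaviour tested above comes
    # from the config file loaded at the top of the test; a hypothetical
    # sketch of the relevant section (key names inferred from the tests,
    # not copied from terms_test_01.conf):
    #
    #   [terms_dummy_test]
    #   target = dummy_target_field         # hypothetical field names
    #   aggregator = dummy_aggregator_field
    #   min_target_buckets = 2              # the assertions rely on this
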
    def test_terms_evaluate_coeff_of_variation_like_expected_document(self):
        self.test_settings.change_configuration_path(
            "/app/tests/unit_tests/files/terms_test_01.conf")
        analyzer = TermsAnalyzer("terms_dummy_test_no_bucket")

        doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
        expected_doc = copy.deepcopy(
            doc_with_terms_outlier_coeff_of_variation_no_score_sort)
        # Add doc to the database
        self.test_es.add_doc(doc_without_outlier)

        # Run the analysis (assuming all docs match the model's query)
        analyzer.evaluate_model()

        result = list(es.scan())[0]
        self.assertEqual(result, expected_doc)

    def test_evaluate_batch_for_outliers_not_enough_target_buckets_one_doc_max_two(
            self):
        self.test_settings.change_configuration_path(
            "/app/tests/unit_tests/files/terms_test_01.conf")
        analyzer = TermsAnalyzer("terms_dummy_test")

        aggregator_value = LIST_AGGREGATOR_VALUE[0]
        target_value = random.choice(LIST_TARGET_VALUE)
        observations = {}
        doc = copy.deepcopy(random.choice(LIST_DOC))
        eval_terms_array = analyzer.add_term_to_batch(defaultdict(),
                                                      aggregator_value,
                                                      target_value,
                                                      observations, doc)

        result = analyzer.evaluate_batch_for_outliers(terms=eval_terms_array)
        self.assertEqual(result, [])
Example #4
    def test_add_document_to_batch_empty_aggregator(self):
        dummy_doc_generate = DummyDocumentsGenerate()
        dummy_doc = dummy_doc_generate.generate_document()

        current_batch = {"dummy_key": "dummy_value"}
        result = TermsAnalyzer._add_document_to_batch(current_batch,
                                                      ["dummy_target"],
                                                      list(),  # no aggregators
                                                      dummy_doc)
        self.assertEqual(result, current_batch)

    def test_terms_generated_document_coeff_of_variation_respect_min(self):
        self.test_settings.change_configuration_path(
            "/app/tests/unit_tests/files/terms_test_01.conf")
        analyzer = TermsAnalyzer("terms_dummy_test_no_bucket")

        doc_generator = DummyDocumentsGenerate()
        nbr_val = 24  # e.g. one bucket per hour of the day
        max_trigger_sensitivity = analyzer.model_settings[
            "trigger_sensitivity"]
        default_value = 5  # By default, 5 documents are created per hour (chosen arbitrarily)
        max_difference = 3  # Maximum deviation from the default, so between 2 and 8 documents (inclusive)
        all_doc = doc_generator.create_doc_uniq_target_variable_at_most_specific_coef_variation(
            nbr_val, max_trigger_sensitivity, max_difference, default_value)
        self.test_es.add_multiple_docs(all_doc)
        analyzer.evaluate_model()

        nbr_outliers = 0
        for doc in es.scan():
            if "outliers" in doc['_source']:
                nbr_outliers += 1
        self.assertEqual(nbr_outliers, len(all_doc))
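
    # The "coeff_of_variation" trigger exercised above is never spelled out
    # in the tests; a plausible reading (an illustrative sketch, not the
    # actual ee-outliers implementation) is the ratio of the standard
    # deviation to the mean of the per-bucket document counts, so that
    # near-uniform counts - as generated above - give a small value:
    @staticmethod
    def _coeff_of_variation_sketch(counts):
        import statistics
        # e.g. [5, 4, 6, 5] gives pstdev 0.71 / mean 5 = 0.14, which would
        # trigger when at or below "trigger_sensitivity" for a low-variation
        # (beaconing-like) use case
        return statistics.pstdev(counts) / statistics.mean(counts)
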
    def _test_whitelist_batch_document_not_process_all(
            self):  # TODO: fix with the new whitelist system (disabled via the "_" prefix)
        self.test_settings.change_configuration_path(
            "/app/tests/unit_tests/files/terms_test_with_whitelist.conf")
        analyzer = TermsAnalyzer("terms_dummy_test")

        # Whitelisted (ignored)
        doc1_without_outlier = copy.deepcopy(
            doc_without_outliers_test_whitelist_01_test_file)
        self.test_es.add_doc(doc1_without_outlier)
        # Not whitelisted (will be flagged)
        doc2_without_outlier = copy.deepcopy(
            doc_without_outliers_test_whitelist_02_test_file)
        self.test_es.add_doc(doc2_without_outlier)
        # Not whitelisted
        doc3_without_outlier = copy.deepcopy(
            doc_without_outliers_test_whitelist_03_test_file)
        self.test_es.add_doc(doc3_without_outlier)

        analyzer.evaluate_model()

        self.assertEqual(len(analyzer.outliers), 2)
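
For context, the whitelisting exercised by the disabled test above is driven by the conf file it loads; a hypothetical sketch of such a section (section and key names are assumptions, not copied from terms_test_with_whitelist.conf):

    [whitelist_literals]
    # a document whose fields match a listed literal is not flagged
    dummy_whitelist_entry = some_exact_field_value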
Example #7
    def test_add_document_to_batch_one_aggregator_and_one_target(self):
        dummy_doc_generate = DummyDocumentsGenerate()
        dummy_doc = dummy_doc_generate.generate_document()
        target_value = "dummy_target"
        aggregator_value = "dummy_aggregator"

        current_batch = {"dummy_key": "dummy_value"}
        result = TermsAnalyzer._add_document_to_batch(current_batch,
                                                      [target_value],
                                                      [aggregator_value],
                                                      dummy_doc)

        expected_batch = current_batch.copy()
        expected_batch[aggregator_value] = defaultdict(list)
        expected_batch[aggregator_value]["targets"].append(target_value)
        expected_batch[aggregator_value]["observations"].append(dict())
        expected_batch[aggregator_value]["raw_docs"].append(dummy_doc)

        self.assertEqual(result, expected_batch)
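
Together, the two _add_document_to_batch tests pin down the expected behaviour: an empty aggregator list leaves the batch untouched, while each aggregator value gets a defaultdict(list) entry whose "targets", "observations" and "raw_docs" lists grow in lockstep. A minimal sketch consistent with those assertions (illustrative only, not the actual ee-outliers implementation):

    from collections import defaultdict

    def _add_document_to_batch(current_batch, target_values, aggregator_values, doc):
        # With no aggregator values the loop body never runs and the batch
        # is returned unchanged, matching the "empty aggregator" test above
        for aggregator_value in aggregator_values:
            if aggregator_value not in current_batch:
                current_batch[aggregator_value] = defaultdict(list)
            current_batch[aggregator_value]["targets"].extend(target_values)
            current_batch[aggregator_value]["observations"].append(dict())
            current_batch[aggregator_value]["raw_docs"].append(doc)
        return current_batch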
Example #8
def perform_analysis():
    """ The entrypoint for analysis """
    analyzers = list()

    for config_section_name in settings.config.sections():
        try:
            if config_section_name.startswith("simplequery_"):
                simplequery_analyzer = SimplequeryAnalyzer(config_section_name=config_section_name)
                analyzers.append(simplequery_analyzer)

            if config_section_name.startswith("metrics_"):
                metrics_analyzer = MetricsAnalyzer(config_section_name=config_section_name)
                analyzers.append(metrics_analyzer)

            if config_section_name.startswith("terms_"):
                terms_analyzer = TermsAnalyzer(config_section_name=config_section_name)
                analyzers.append(terms_analyzer)

            if config_section_name.startswith("beaconing_"):
                beaconing_analyzer = BeaconingAnalyzer(config_section_name=config_section_name)
                analyzers.append(beaconing_analyzer)

            if config_section_name.startswith("word2vec_"):
                word2vec_analyzer = Word2VecAnalyzer(config_section_name=config_section_name)
                analyzers.append(word2vec_analyzer)
        except Exception:
            logging.logger.error(traceback.format_exc())

    analyzers_to_evaluate = list()

    for analyzer in analyzers:
        if analyzer.should_run_model or analyzer.should_test_model:
            analyzers_to_evaluate.append(analyzer)

    random.shuffle(analyzers_to_evaluate)
    analyzed_models = 0
    for analyzer in analyzers_to_evaluate:
        try:
            analyzer.evaluate_model()
            analyzed_models += 1
            logging.logger.info("finished processing use case - %d/%d [%.2f%% done]" %
                                (analyzed_models, len(analyzers_to_evaluate),
                                 analyzed_models / len(analyzers_to_evaluate) * 100))
        except Exception:
            logging.logger.error(traceback.format_exc())
Example #9
def perform_analysis():
    """ The entrypoint for analysis """
    analyzers = list()

    for config_section_name in settings.config.sections():
        _analyzer = None
        try:
            if config_section_name.startswith("simplequery_"):
                _analyzer = SimplequeryAnalyzer(
                    config_section_name=config_section_name)
                analyzers.append(_analyzer)

            elif config_section_name.startswith("metrics_"):
                _analyzer = MetricsAnalyzer(
                    config_section_name=config_section_name)
                analyzers.append(_analyzer)

            elif config_section_name.startswith("terms_"):
                _analyzer = TermsAnalyzer(
                    config_section_name=config_section_name)
                analyzers.append(_analyzer)

            elif config_section_name.startswith("beaconing_"):
                logging.logger.error(
                    "use of the beaconing model is deprecated, please use the "
                    "terms model with the coeff_of_variation trigger method "
                    "to convert use case " + config_section_name)

            elif config_section_name.startswith("word2vec_"):
                _analyzer = Word2VecAnalyzer(
                    config_section_name=config_section_name)
                analyzers.append(_analyzer)
        except Exception:
            logging.logger.error("error while initializing analyzer " +
                                 config_section_name,
                                 exc_info=True)

    analyzers_to_evaluate = list()

    for analyzer in analyzers:
        if analyzer.should_run_model or analyzer.should_test_model:
            analyzers_to_evaluate.append(analyzer)

    random.shuffle(analyzers_to_evaluate)

    for index, analyzer in enumerate(analyzers_to_evaluate):
        if analyzer.configuration_parsing_error:
            continue

        try:
            analyzer.analysis_start_time = datetime.today().timestamp()
            analyzer.evaluate_model()
            analyzer.analysis_end_time = datetime.today().timestamp()
            analyzer.completed_analysis = True

            logging.logger.info("finished processing use case - %d/%d [%.2f%% done]" %
                                (index + 1, len(analyzers_to_evaluate),
                                 (index + 1) / len(analyzers_to_evaluate) * 100))
        except elasticsearch.exceptions.NotFoundError:
            analyzer.index_not_found_analysis = True
            logging.logger.warning(
                "index %s does not exist, skipping use case" %
                analyzer.es_index)
        except Exception:
            analyzer.unknown_error_analysis = True
            logging.logger.error("error while analyzing use case",
                                 exc_info=True)
        finally:
            es.flush_bulk_actions(refresh=True)

    return analyzers_to_evaluate
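
Because this version returns analyzers_to_evaluate, a caller can summarize the outcome flags set above; a hypothetical usage sketch (the summary line is illustrative, not part of the project):

    evaluated_analyzers = perform_analysis()
    completed = sum(1 for a in evaluated_analyzers if a.completed_analysis)
    logging.logger.info("completed %d/%d use cases" %
                        (completed, len(evaluated_analyzers)))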