Ejemplo n.º 1
0
def print_intro():
    """
    Log the startup banner.

    The banner reports the version and the run mode, the number of configuration files that were
    loaded, and lists every configuration file and whitelist regular expression that failed to load
    or parse.
    """
    logging.logger.info("outliers.py - version %s - contact: [email protected]", EE_OUTLIERS_VERSIONS)
    logging.logger.info("run mode: %s", settings.args.run_mode)

    logging.print_generic_intro("initializing")
    loaded_count = len(settings.loaded_config_paths)
    logging.logger.info("loaded %d configuration files", loaded_count)

    # Configuration files that could not be loaded: report how many, then name each of them
    broken_configs = settings.failed_config_paths
    if broken_configs:
        logging.logger.error("failed to load %d configuration files that will be ignored", len(broken_configs))

        for config_path in broken_configs:
            logging.logger.error("\t+ failed to load configuration file %s", config_path)

    # Whitelist regular expressions that could not be parsed get the same treatment
    broken_regexes = settings.failing_regular_expressions
    if broken_regexes:
        logging.logger.error("failed to parse %d regular expressions in whitelist that will be ignored",
                             len(broken_regexes))

        for regex in broken_regexes:
            logging.logger.error("\t+ failed to parse regular expression %s", regex)
Ejemplo n.º 2
0
    def evaluate_model(self):
        """
        Train the word2vec model, or evaluate events against an already trained model.

        When the ``train_model`` setting is enabled, the model is trained and nothing is evaluated.
        Otherwise a ``Word2Vec`` model is created for this model name; if it is not trained, a warning
        is logged and the analysis is skipped. A model flagged with ``use_test_data`` is evaluated
        with ``evaluate_test_sentences`` only. In all other cases the documents matching the
        ``es_query_filter`` setting are scanned, flattened into sentences according to the
        ``sentence_format`` setting, and evaluated in batches by ``evaluate_batch_for_outliers``.
        """
        self.extract_extra_model_settings()

        # Train the model; training and evaluation never happen in the same call
        if self.model_settings["train_model"]:
            self.train_model()
            return

        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        if not w2v_model.is_trained():
            logging.logger.warning("model was not trained! Skipping analysis.")
        else:
            # Check if we need to run the test data instead of real data
            if w2v_model.use_test_data:
                logging.print_generic_intro("using test data instead of live data to evaluate model " + self.model_name)
                self.evaluate_test_sentences(w2v_model=w2v_model)
                return

            self.total_events = es.count_documents(search_query=search_query)
            logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)

            logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating word2vec model")

            # raw_docs is kept index-aligned with eval_sentences: a document is appended once for every
            # sentence it produced
            raw_docs = list()
            eval_sentences = list()

            for doc in es.scan(search_query=search_query):
                # presumably advances logging.current_step, which the batch check below relies on - TODO confirm
                logging.tick()
                fields = es.extract_fields_from_document(doc)

                try:
                    new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                    eval_sentences.extend(new_sentences)
                except KeyError:
                    logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
                    # NOTE(review): this `continue` also skips the batch check below. If the very last
                    # scanned document ends up here, sentences that are still pending in eval_sentences
                    # do not appear to be evaluated by this loop - confirm whether that is intended.
                    continue

                for _ in new_sentences:
                    raw_docs.append(doc)

                # Evaluate batch of events against the model, either because the last step was reached
                # or because the batch holds at least word2vec_batch_eval_size sentences
                if logging.current_step == self.total_events or len(eval_sentences) >= settings.config.getint("machine_learning", "word2vec_batch_eval_size"):
                    logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                    outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model, eval_sentences=eval_sentences, raw_docs=raw_docs)

                    if len(outliers) > 0:
                        # Count the distinct "summary" values among the outliers of this batch
                        unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")

                    # Reset data structures for next batch
                    raw_docs = list()
                    eval_sentences = list()
Ejemplo n.º 3
0
def print_intro():
    """
    Log the startup banner: the run mode, the number of configuration files that were loaded and
    every configuration file that failed to load.
    """
    logging.logger.info("outliers.py started - contact: [email protected]")

    # Values are passed as logging arguments instead of being concatenated into the message. The
    # formatting is left to the logger, and a non-string value (for example an unset run mode) can
    # no longer raise a TypeError while the message is being built.
    logging.logger.info("run mode: %s", settings.args.run_mode)

    logging.print_generic_intro("initializing")
    logging.logger.info("loaded %d configuration files",
                        len(settings.loaded_config_paths))

    if settings.failed_config_paths:
        logging.logger.warning("failed to load %d configuration files",
                               len(settings.failed_config_paths))

        for failed_config_path in settings.failed_config_paths:
            logging.logger.warning("failed to load %s", failed_config_path)
Ejemplo n.º 4
0
def run_daemon_mode():
    """
    Keep outliers running as a daemon.

    The first analysis starts immediately. Every following analysis waits for the cron schedule
    defined in the configuration file, except after an analysis that ran into errors: in that case
    the next analysis starts after sleeping for a minute. While waiting for the schedule, the
    configuration files are checked for changes and reloaded when needed.
    """
    # Announce every configuration file that is going to be watched for modifications
    for watched_path in settings.args.config:
        logging.logger.info("monitoring configuration file %s for changes", watched_path)

    config_watcher = FileModificationWatcher()
    config_watcher.add_files(settings.args.config)

    # Retry every minute for as long as the Elasticsearch connection cannot be initialized
    while not es.init_connection():
        time.sleep(60)

    # The housekeeping job is created here, but only started right before an analysis
    housekeeping_job = HousekeepingJob()

    first_run = True
    previous_run_ok = None

    # Daemon loop: only ends when the process is stopped
    while True:
        next_run = None

        # Neither flag changes while waiting, so whether we wait at all is decided once per iteration:
        # we never wait before the first run, nor after a run that did not succeed
        wait_for_schedule = not first_run and previous_run_ok is True

        while wait_for_schedule and (next_run is None or datetime.now() < next_run):
            # Without a planned run, one has to be scheduled
            needs_scheduling = next_run is None

            # A changed configuration file is reloaded, and the run is rescheduled
            if config_watcher.files_changed():
                logging.logger.info("configuration file changed, reloading")
                settings.process_configuration_files()
                needs_scheduling = True

            # Plan the next run according to the cron schedule of the configuration file
            if needs_scheduling:
                cron = croniter(settings.config.get("daemon", "schedule"), datetime.now())
                next_run = cron.get_next(datetime)
                logging.logger.info("next run scheduled on {0:%Y-%m-%d %H:%M:%S}".format(next_run))

            # Look at the schedule again in 5 seconds
            time.sleep(5)

        # Reload the settings right before analyzing
        settings.process_configuration_files()

        if first_run:
            first_run = False
            logging.logger.info(
                "first run, so we will start immediately - after this, "
                "we will respect the cron schedule defined in the configuration file")

            # The configuration file decides whether existing outliers are wiped on the first run
            wipe_outliers = settings.config.getboolean("general", "es_wipe_all_existing_outliers")
            if wipe_outliers:
                logging.logger.info("wiping all existing outliers on first run")
                es.remove_all_outliers()

        # The connection might have gone away between two runs: retry every minute until it is back
        while not es.init_connection():
            time.sleep(60)

        # Housekeeping has to be running during the analysis
        if not housekeeping_job.is_alive():
            housekeeping_job.start()

        # Analyze, then print the summary of the analysis
        logging.print_generic_intro("starting outlier detection")
        analyzed_models = perform_analysis(housekeeping_job)
        print_analysis_summary(analyzed_models)

        failed_models = [model for model in analyzed_models if model.unknown_error_analysis]

        # A run with failing models is repeated after a minute instead of waiting for the schedule
        previous_run_ok = not failed_models
        if failed_models:
            logging.logger.warning(
                "ran into errors while analyzing use cases - "
                "not going to wait for the cron schedule, "
                "we just start analyzing again after sleeping for a minute first")
            time.sleep(60)

        logging.print_generic_intro("finished performing outlier detection")
Ejemplo n.º 5
0
def run_daemon_mode():
    """
    Run outliers in daemon mode.

    The first analysis starts immediately. After that, an analysis is started each time the cron
    schedule found in the ``daemon`` section of the configuration is reached, unless the previous
    analysis ran into errors: in that case a new analysis is started after sleeping for a minute.
    While waiting for the schedule, the configuration files are checked for changes every 5 seconds
    and reloaded when they changed. The loop only ends when the process is stopped.
    """
    # In daemon mode, we also want to monitor the configuration file for changes.
    # In case of a change, we need to make sure that we are using this new configuration file
    for config_file in settings.args.config:
        # Passed as a logging argument so that a non-string entry cannot break the message
        logging.logger.info("monitoring configuration file %s for changes",
                            config_file)

    file_mod_watcher = FileModificationWatcher()
    file_mod_watcher.add_files(settings.args.config)

    # Initialize Elasticsearch connection
    es.init_connection()

    # Create housekeeping job, don't start it yet
    housekeeping_job = HousekeepingJob()

    first_run = True
    run_succeeded_without_errors = None

    while True:
        next_run = None
        should_schedule_next_run = False

        # Wait for the schedule, except before the first run and after a run that did not succeed
        while (next_run is None or datetime.now() < next_run) and first_run is False and \
                run_succeeded_without_errors is True:
            # Without a planned run, one has to be scheduled
            if next_run is None:
                should_schedule_next_run = True

            # Check for configuration file changes and load them in case it's needed
            if file_mod_watcher.files_changed():
                logging.logger.info("configuration file changed, reloading")
                settings.process_configuration_files()
                should_schedule_next_run = True

            # Plan the next run based on the cron schedule defined in the configuration file
            if should_schedule_next_run:
                next_run = croniter(settings.config.get("daemon", "schedule"),
                                    datetime.now()).get_next(datetime)
                logging.logger.info(
                    "next run scheduled on {0:%Y-%m-%d %H:%M:%S}".format(
                        next_run))
                should_schedule_next_run = False

            # Wait 5 seconds before checking the schedule again
            time.sleep(5)

        settings.process_configuration_files()  # Refresh settings

        if first_run:
            first_run = False
            logging.logger.info(
                "first run, so we will start immediately - after this, we will respect the cron "
                "schedule defined in the configuration file")

            # Wipe all existing outliers if needed
            if settings.config.getboolean("general",
                                          "es_wipe_all_existing_outliers"):
                logging.logger.info(
                    "wiping all existing outliers on first run")
                es.remove_all_outliers()
        else:
            # Make sure we are still connected to Elasticsearch before analyzing, in case something went wrong with
            # the connection in between runs
            es.init_connection()

        # Make sure housekeeping is up and running
        if not housekeeping_job.is_alive():
            housekeeping_job.start()

        # Perform analysis and print the analysis summary at the end
        logging.print_generic_intro("starting outlier detection")
        analyzed_models = perform_analysis()
        print_analysis_summary(analyzed_models)

        errored_models = [
            analyzer for analyzer in analyzed_models
            if analyzer.unknown_error_analysis
        ]

        # Check the result of the analysis. In case an error occurred, we run again after a minute
        # instead of waiting for the schedule
        if errored_models:
            run_succeeded_without_errors = False
            logging.logger.warning(
                "ran into errors while analyzing use cases - not going to wait for the cron "
                "schedule, we just start analyzing again after sleeping for a minute first"
            )
            time.sleep(60)
        else:
            run_succeeded_without_errors = True

        logging.print_generic_intro("finished performing outlier detection")
Ejemplo n.º 6
0
    "machine_learning", "tensorflow_log_level")

# Log Handlers
LOG_FILE = settings.config.get("general", "log_file")

# os.path.dirname() returns an empty string for a bare file name such as "outliers.log". That case is
# treated as the current directory, so that a relative log file is not mistaken for a missing log
# directory. An empty log_file setting still ends up in the warning branch.
if LOG_FILE and os.path.exists(os.path.dirname(LOG_FILE) or os.curdir):
    logging.add_file_handler(LOG_FILE)
else:
    logging.logger.warning(
        "log directory for log file %s does not exist, check your settings! Only logging to stdout.",
        LOG_FILE)

# Startup banner. Values are passed as logging arguments, like in the warning above, instead of
# being concatenated into the message.
logging.logger.info("outliers.py started - contact: [email protected]")
logging.logger.info("run mode: %s", settings.args.run_mode)

logging.print_generic_intro("initializing")
logging.logger.info("loaded %d configuration files",
                    len(settings.loaded_config_paths))

# Report the configuration files that could not be loaded: first how many, then each of them
if settings.failed_config_paths:
    logging.logger.warning("failed to load %d configuration files",
                           len(settings.failed_config_paths))

    for failed_config_path in settings.failed_config_paths:
        logging.logger.warning("failed to load %s", failed_config_path)


def perform_analysis():
    """ The entrypoint for analysis """
    analyzers = list()
Ejemplo n.º 7
0
# Logging verbosity and level come from the "general" section of the configuration; the TensorFlow
# log level is handed over through the TF_CPP_MIN_LOG_LEVEL environment variable
logging.verbosity = settings.config.getint("general", "log_verbosity")
logging.logger.setLevel(settings.config.get("general", "log_level"))
os.environ['TF_CPP_MIN_LOG_LEVEL'] = settings.config.get("machine_learning", "tensorflow_log_level")

# Log Handlers
LOG_FILE = settings.config.get("general", "log_file")

# os.path.dirname() returns an empty string for a bare file name such as "outliers.log". That case is
# treated as the current directory, so that a relative log file is not mistaken for a missing log
# directory. An empty log_file setting still ends up in the warning branch.
if LOG_FILE and os.path.exists(os.path.dirname(LOG_FILE) or os.curdir):
    logging.add_file_handler(LOG_FILE)
else:
    logging.logger.warning("log directory for log file %s does not exist, check your settings! Only logging to stdout.", LOG_FILE)

# Startup banner. Values are passed as logging arguments, like in the warning above, instead of
# being concatenated into the message.
logging.logger.info("outliers.py started - contact: [email protected]")
logging.logger.info("run mode: %s", settings.args.run_mode)

logging.print_generic_intro("initializing")
logging.logger.info("loaded %d configuration files", len(settings.loaded_config_paths))

# Report the configuration files that could not be loaded: first how many, then each of them
if settings.failed_config_paths:
    logging.logger.warning("failed to load %d configuration files", len(settings.failed_config_paths))

    for failed_config_path in settings.failed_config_paths:
        logging.logger.warning("failed to load %s", failed_config_path)


def perform_analysis():
    """ The entrypoint for analysis """
    analyzers = list()

    for config_section_name in settings.config.sections():
        try: