def evaluate_model(self):
    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=self.search_query,
                                                               model_settings=self.model_settings)
    self.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

    if self.total_events > 0:
        current_batch = defaultdict()
        targets_for_next_batch = defaultdict()
        total_targets_in_batch = 0

        for doc in documents:
            logging.tick()
            target_sentences, aggregator_sentences = self._compute_aggregator_and_target_value(
                doc, self.model_settings["target"])

            if target_sentences is not None and aggregator_sentences is not None:
                # Add current document to current_batch
                current_batch = self._add_document_to_batch(current_batch, target_sentences,
                                                            aggregator_sentences, doc)
                total_targets_in_batch += len(target_sentences) * len(aggregator_sentences)

            # Evaluate batch of events against the model
            is_last_batch = (logging.current_step == self.total_events)  # Check if it is the last batch

            # Run if it is the last batch OR if the batch size is large enough
            if is_last_batch or total_targets_in_batch >= self.terms_batch_eval_size:
                # Display log message
                log_message = "evaluating batch of " + "{:,}".format(total_targets_in_batch) + " terms "
                if len(targets_for_next_batch) > 0:
                    log_message += "(+ " + "{:,}".format(len(targets_for_next_batch)) + " terms from last batch) "
                log_message += "[" + "{:,}".format(logging.current_step) + " events processed]"
                logging.logger.info(log_message)

                # evaluate the current batch
                outliers_in_batch, targets_for_next_batch = self._evaluate_batch_for_outliers(batch=current_batch)

                if outliers_in_batch:
                    unique_summaries_in_batch = len(set(o.outlier_dict["summary"] for o in outliers_in_batch))
                    logging.logger.info("processing " + "{:,}".format(len(outliers_in_batch)) +
                                        " outliers in batch [" + "{:,}".format(unique_summaries_in_batch) +
                                        " unique summaries]")

                    for outlier in outliers_in_batch:
                        self.process_outlier(outlier)
                else:
                    logging.logger.info("no outliers processed in batch")

                # Reset data structures for next batch
                current_batch = targets_for_next_batch
                total_targets_in_batch = 0

    self.print_analysis_summary()
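# The batch bookkeeping above leans on analyzer helpers defined elsewhere. As a rough, standalone
# illustration only (the nested dict layout and the add_to_batch helper below are assumptions made
# for this sketch, not the project's actual _add_document_to_batch), the batch can be pictured as
# a dict keyed by aggregator that accumulates every target observed for it:
def add_to_batch(batch, target_sentences, aggregator_sentences, doc):
    """Sketch: file each (aggregator, target) combination of this document into the batch."""
    for aggregator in aggregator_sentences:
        key = " - ".join(map(str, aggregator))
        if key not in batch:
            batch[key] = {"targets": [], "raw_docs": []}
        for target in target_sentences:
            batch[key]["targets"].append(" - ".join(map(str, target)))
            batch[key]["raw_docs"].append(doc)
    return batch


# Example: one document with two targets and one aggregator adds two targets under that aggregator,
# which is why total_targets_in_batch grows by len(target_sentences) * len(aggregator_sentences).
batch = add_to_batch(dict(), [["cmd.exe"], ["powershell.exe"]], [["host-01"]], {"_id": "doc-1"})
assert len(batch["host-01"]["targets"]) == 2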
def evaluate_model(self):
    # Build a filter that excludes events already tagged as outliers by this very model, so that
    # hits from a previous run are not evaluated (and tagged) a second time
    model_filter = {"bool": {"filter": [{"term": {"outliers.model_name.keyword": {"value": self.model_name}}},
                                        {"term": {"outliers.model_type.keyword": {"value": "simplequery"}}}]}}
    exclude_hits_filter = {"bool": {"must_not": model_filter}}

    query = self.search_query
    if "filter" in query:
        query["filter"].append(exclude_hits_filter)
    else:
        query["filter"] = [exclude_hits_filter]

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=query,
                                                               model_settings=self.model_settings)
    self.print_analysis_intro(event_type="evaluating " + self.model_type + "_" + self.model_name,
                              total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating " + self.model_type + " model")

    if self.total_events > 0:
        for doc in documents:
            logging.tick()
            fields = es.extract_fields_from_document(
                doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            outlier = self.create_outlier(fields, doc)
            self.process_outlier(outlier)

    self.print_analysis_summary()
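# For reference, after the merge above the query ends up shaped as follows (the model name
# "suspicious_downloads" is a made-up example value). Note that query is an alias for
# self.search_query, so the exclusion filter is appended to the analyzer's query in place:
example_query = {
    "filter": [
        {"bool": {"must_not": {"bool": {"filter": [
            {"term": {"outliers.model_name.keyword": {"value": "suspicious_downloads"}}},
            {"term": {"outliers.model_type.keyword": {"value": "simplequery"}}}
        ]}}}}
    ]
}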
def train_model(self):
    w2v_model = word2vec.Word2Vec(name=self.model_name)

    sentences = list()

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=self.search_query,
                                                               model_settings=self.model_settings)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    self.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)

    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events,
                        desc=self.model_name + " - preparing word2vec training set")
    if self.total_events > 0:
        for doc in documents:
            if len(sentences) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                if set(self.model_settings["sentence_format"]).issubset(fields.keys()):
                    new_sentences = helpers.utils.flatten_fields_into_sentences(
                        fields=fields, sentence_format=self.model_settings["sentence_format"])
                    for sentence in new_sentences:
                        sentences.append(tuple(sentence))

                    # Remove all duplicates from sentences for training (currently disabled)
                    # sentences = list(set(sentences))
            else:
                # We have collected sufficient training data
                break

    # Now, train the model
    if len(sentences) > 0:
        w2v_model.train_model(sentences)
    else:
        logging.logger.warning("no sentences to train model on. Are you sure the sentence configuration is " +
                               "correctly defined?")
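# helpers.utils.flatten_fields_into_sentences is used as a black box above. A minimal standalone
# sketch of the behavior the training code relies on (assumption: list-valued fields fan out into
# one sentence per combination, which is also why duplicate sentences can appear and deduplication
# is considered above):
import itertools


def flatten_fields_into_sentences_sketch(fields, sentence_format):
    """Sketch: build one sentence per combination of the (possibly list-valued) requested fields."""
    values_per_field = []
    for field_name in sentence_format:
        value = fields[field_name]
        values_per_field.append(value if isinstance(value, list) else [value])
    return [list(combination) for combination in itertools.product(*values_per_field)]


# Example: two hostnames fan out into two training sentences.
sentences = flatten_fields_into_sentences_sketch({"user": "admin", "host": ["srv-01", "srv-02"]},
                                                 ["user", "host"])
assert sentences == [["admin", "srv-01"], ["admin", "srv-02"]]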
def train_model(self):
    train_data = list()

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=self.search_query,
                                                               model_settings=self.model_settings)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    self.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)

    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing training set")
    if self.total_events > 0:
        for doc in documents:
            if len(train_data) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                train_data.append(fields)
            else:
                # We have collected sufficient training data
                break

    # Now, train the model
    if train_data:
        pass  # Model-specific training logic goes here
    else:
        logging.logger.warning("no training data to train model on. Are you sure the model configuration is " +
                               "correctly defined?")
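# The training step above is intentionally left as a placeholder for subclasses to fill in. As a
# purely illustrative sketch of what could go there (the numeric feature encoding and the
# scikit-learn IsolationForest choice below are assumptions, not what the project ships):
from sklearn.ensemble import IsolationForest


def train_isolation_forest_sketch(train_data, numeric_fields):
    """Sketch: turn the extracted field dicts into a numeric matrix and fit an anomaly detector."""
    matrix = [[float(fields.get(name, 0)) for name in numeric_fields] for fields in train_data]
    model = IsolationForest(n_estimators=100, random_state=42)
    model.fit(matrix)
    return model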
def evaluate_model(self):
    # Train the model
    if self.model_settings["train_model"]:
        self.train_model()
        return

    w2v_model = word2vec.Word2Vec(name=self.model_name)
    search_query = self.search_query

    if not w2v_model.is_trained():
        logging.logger.warning("model was not trained! Skipping analysis.")
    else:
        # Check if we need to run the test data instead of real data
        if w2v_model.use_test_data:
            logging.print_generic_intro("using test data instead of live data to evaluate model " +
                                        self.model_name)
            self.evaluate_test_sentences(w2v_model=w2v_model)
            return

        self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                                   search_query=search_query,
                                                                   model_settings=self.model_settings)
        self.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating word2vec model")
        if self.total_events > 0:
            raw_docs = list()
            eval_sentences = list()

            for doc in documents:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc, extract_derived_fields=self.model_settings["use_derived_fields"])

                try:
                    new_sentences = helpers.utils.flatten_fields_into_sentences(
                        fields=fields, sentence_format=self.model_settings["sentence_format"])
                    eval_sentences.extend(new_sentences)
                except KeyError:
                    logging.logger.debug("skipping event which does not contain the target and aggregator " +
                                         "fields we are processing. - [" + self.model_name + "]")
                    continue

                for _ in new_sentences:
                    raw_docs.append(doc)

                # Evaluate batch of events against the model
                if logging.current_step == self.total_events or \
                        len(eval_sentences) >= settings.config.getint("machine_learning",
                                                                      "word2vec_batch_eval_size"):
                    logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                    outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model,
                                                                eval_sentences=eval_sentences,
                                                                raw_docs=raw_docs)

                    if len(outliers) > 0:
                        unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                        logging.logger.info("total outliers in batch processed: " +
                                            "{:,}".format(len(outliers)) + " [" +
                                            "{:,}".format(unique_summaries) + " unique summaries]")

                    # Reset data structures for next batch
                    raw_docs = list()
                    eval_sentences = list()
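# evaluate_batch_for_outliers does the model-specific scoring for the loop above. A minimal sketch
# of the contract it has to honor (the score_sentence method and the fixed likelihood cutoff are
# assumptions for illustration, not the project's actual scoring API). Note that the loop above
# appends doc to raw_docs once per generated sentence precisely so the two lists stay index-aligned:
def evaluate_batch_for_outliers_sketch(w2v_model, eval_sentences, raw_docs, cutoff=0.05):
    """Sketch: score each sentence against the trained model; low-likelihood sentences are outliers."""
    outliers = []
    for sentence, doc in zip(eval_sentences, raw_docs):
        score = w2v_model.score_sentence(sentence)  # hypothetical: likelihood of this word sequence
        if score < cutoff:
            outliers.append((doc, sentence, score))
    return outliers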
def evaluate_model(self):
    batch = defaultdict()  # Contains the current batch information
    remaining_metrics = defaultdict()
    total_metrics_in_batch = 0

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=self.search_query,
                                                               model_settings=self.model_settings)
    self.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating " + self.model_type + " model")

    if self.total_events > 0:
        for doc in documents:
            logging.tick()

            # Extract target and aggregator values
            target_value, aggregator_sentences = self._compute_aggregator_and_target_value(doc)

            # If target and aggregator values exist
            if target_value is not None and aggregator_sentences is not None:
                # Add current document to eval_metrics
                batch, metric_added = self._add_document_to_batch(doc, batch, target_value, aggregator_sentences)

                # We can only have 1 target field for metrics (as opposed to terms), so the total number of
                # targets added is the same as the total number of aggregator sentences that were processed
                # for this document
                if metric_added:
                    total_metrics_in_batch += len(aggregator_sentences)

            is_last_batch = (logging.current_step == self.total_events)  # Check if it is the last batch

            # Run if it is the last batch OR if the batch size is large enough
            if is_last_batch or total_metrics_in_batch >= self.metrics_batch_eval_size:
                # Display log message
                log_message = "evaluating batch of " + "{:,}".format(total_metrics_in_batch) + " metrics "
                if remaining_metrics:
                    log_message += "(+ " + "{:,}".format(len(remaining_metrics)) + " metrics from last batch) "
                log_message += "[" + "{:,}".format(logging.current_step) + " events processed]"
                logging.logger.info(log_message)

                outliers_in_batch, remaining_metrics = self._evaluate_batch_for_outliers(
                    batch=batch, is_last_batch=is_last_batch)

                # For each result, save it in batch and in ES
                if outliers_in_batch:
                    unique_summaries_in_batch = len(set(o.outlier_dict["summary"] for o in outliers_in_batch))
                    logging.logger.info("processing " + "{:,}".format(len(outliers_in_batch)) +
                                        " outliers in batch [" + "{:,}".format(unique_summaries_in_batch) +
                                        " unique summaries]")

                    for outlier in outliers_in_batch:
                        self.process_outlier(outlier)
                else:
                    logging.logger.info("no outliers detected in batch")

                # Reset data structures for next batch
                batch = remaining_metrics
                total_metrics_in_batch = 0

    self.print_analysis_summary()
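# For metrics, _evaluate_batch_for_outliers typically decides per aggregator whether each collected
# value is anomalous. A minimal standalone sketch (the flat batch layout and the median-absolute-
# deviation trigger with a fixed sensitivity below are assumptions chosen for illustration):
import statistics


def evaluate_metrics_batch_sketch(batch, sensitivity=3.0):
    """Sketch: per aggregator, flag values whose distance from the median exceeds
    sensitivity times the median absolute deviation (MAD)."""
    outliers = []
    for aggregator, values in batch.items():
        median = statistics.median(values)
        mad = statistics.median([abs(value - median) for value in values])
        if mad == 0:
            continue  # all values (nearly) identical; nothing to flag
        for value in values:
            if abs(value - median) / mad > sensitivity:
                outliers.append((aggregator, value))
    return outliers


# Example: the single extreme value for host-01 is flagged, the rest are not.
assert evaluate_metrics_batch_sketch({"host-01": [10, 11, 9, 10, 250]}) == [("host-01", 250)]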