def search_logs(self, search_req):
    """Get all logs similar to given logs"""
    similar_log_ids = set()
    logger.info("Started searching by request %s", search_req.json())
    logger.info("ES Url %s", utils.remove_credentials_from_url(self.host))
    t_start = time()
    if not self.index_exists(str(search_req.projectId)):
        return []
    searched_logs = set()
    for message in search_req.logMessages:
        if not message.strip():
            continue
        cleaned_message = self.clean_message(message)
        sanitized_msg = utils.leave_only_unique_lines(
            utils.sanitize_text(
                utils.first_lines(cleaned_message, search_req.logLines)))
        msg_words = " ".join(utils.split_words(sanitized_msg))
        # Skip messages whose normalized word sequence was already queried
        if msg_words in searched_logs:
            continue
        searched_logs.add(msg_words)
        query = self.build_search_query(search_req, sanitized_msg)
        res = self.es_client.search(index=str(search_req.projectId), body=query)
        similar_log_ids = similar_log_ids.union(
            self.find_similar_logs_by_cosine_similarity(msg_words, message, res))
    logger.info(
        "Finished searching by request %s with %d results. It took %.2f sec.",
        search_req.json(), len(similar_log_ids), time() - t_start)
    return list(similar_log_ids)
def run(self):
    for field in self.fields:
        log_field_ids = {}
        index_in_message_array = 0
        count_vector_matrix = None
        all_messages = []
        for log, res in self.all_results:
            for obj in [log] + res["hits"]["hits"]:
                if obj["_id"] not in log_field_ids:
                    text = " ".join(
                        utils.split_words(
                            obj["_source"][field],
                            min_word_length=self.min_word_length))
                    if text.strip() == "":
                        # Mark documents with an empty field so they can be
                        # skipped during similarity calculation
                        log_field_ids[obj["_id"]] = -1
                    else:
                        all_messages.append(text)
                        log_field_ids[obj["_id"]] = index_in_message_array
                        index_in_message_array += 1
        if len(all_messages) > 0:
            vectorizer = CountVectorizer(binary=True, analyzer="word",
                                         token_pattern="[^ ]+")
            count_vector_matrix = np.asarray(
                vectorizer.fit_transform(all_messages).toarray())
        self.all_text_field_ids[field] = log_field_ids
        self.dict_count_vectorizer[field] = count_vector_matrix
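# A minimal, self-contained sketch of the vectorization step above. With
# binary=True each row is a 0/1 word-presence vector, and token_pattern="[^ ]+"
# keeps every space-separated token intact (the messages were already
# tokenized by utils.split_words). Sample messages are made up for illustration.
#
# from sklearn.feature_extraction.text import CountVectorizer
# import numpy as np
#
# sample_messages = ["null pointer exception", "index out of range exception"]
# vectorizer = CountVectorizer(binary=True, analyzer="word", token_pattern="[^ ]+")
# matrix = np.asarray(vectorizer.fit_transform(sample_messages).toarray())
# # matrix has shape (2, 7): one row per message, one column per unique token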
def message_to_array(self, detected_message_res, stacktrace_res):
    all_lines = [" ".join(utils.split_words(detected_message_res))]
    split_log_lines = utils.filter_empty_lines([
        " ".join(utils.split_words(line))
        for line in stacktrace_res.split("\n")])
    split_log_lines_num = len(split_log_lines)
    # Split the stacktrace into at most self.block_to_split blocks, each
    # containing at least self.min_log_number_in_block lines
    data_in_block = max(
        self.min_log_number_in_block,
        math.ceil(split_log_lines_num / self.block_to_split))
    blocks_num = math.ceil(split_log_lines_num / data_in_block)
    for block in range(blocks_num):
        all_lines.append("\n".join(
            split_log_lines[block * data_in_block:(block + 1) * data_in_block]))
    if len([line for line in all_lines if line.strip() != ""]) == 0:
        return []
    return all_lines
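# Worked example for the block arithmetic above (values are illustrative):
# with min_log_number_in_block=2, block_to_split=10 and a 25-line stacktrace,
# data_in_block = max(2, ceil(25 / 10)) = 3 and blocks_num = ceil(25 / 3) = 9,
# so the stacktrace is returned as 9 blocks of up to 3 lines each, preceded
# by the detected message as the first element of all_lines.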
def perform_light_deduplication(self, data):
    text_messages_set = {}
    logs_to_train_idx = []
    additional_logs = {}
    for idx, text_message_data in enumerate(data):
        text_message = text_message_data[0]
        # Normalize by lowercasing and sorting words, so messages that differ
        # only in word order or case are treated as duplicates
        text_message_normalized = " ".join(
            sorted(utils.split_words(text_message, to_lower=True)))
        if text_message_normalized not in text_messages_set:
            logs_to_train_idx.append(idx)
            text_messages_set[text_message_normalized] = idx
            additional_logs[idx] = []
        else:
            additional_logs[
                text_messages_set[text_message_normalized]].append(idx)
    return additional_logs, logs_to_train_idx
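# Illustrative call (data values are made up): for
#   data = [("error at foo", ...), ("at foo error", ...), ("other error", ...)]
# the first two messages normalize to the same sorted word list, so the method
# returns additional_logs = {0: [1], 2: []} and logs_to_train_idx = [0, 2]:
# only indices 0 and 2 are kept for training, and index 1 is attached to 0.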
def prepare_log_words(self, launches):
    log_words = {}
    project = None
    for launch in launches:
        project = str(launch.project)
        for test_item in launch.testItems:
            for log in test_item.logs:
                if log.logLevel < ERROR_LOGGING_LEVEL or not log.message.strip():
                    continue
                clean_message = self.clean_message(log.message)
                det_message, stacktrace = utils.detect_log_description_and_stacktrace(
                    clean_message)
                for word in utils.split_words(stacktrace):
                    # Keep only package-like words with at least two dots
                    if "." in word and len(word.split(".")) > 2:
                        log_words[word] = 1
    return log_words, project
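# Example of the dot filter above (hypothetical tokens): a stacktrace word
# such as "org.example.service.UserService" has more than two dot-separated
# parts and is collected, while "Main.java" (two parts, one dot) is not.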
def find_similar_logs_by_cosine_similarity(self, msg_words, message, res):
    similar_log_ids = set()
    messages_by_ids = {}
    message_index_id = 1
    # Index 0 holds the queried message; search hits start at index 1
    all_messages = [msg_words]
    for result in res["hits"]["hits"]:
        try:
            log_id = int(re.search(r"\d+", result["_id"]).group(0))
            if log_id not in messages_by_ids:
                log_query_words = " ".join(
                    utils.split_words(result["_source"]["message"]))
                all_messages.append(log_query_words)
                messages_by_ids[log_id] = message_index_id
                message_index_id += 1
        except Exception as err:
            logger.error("Id %s is not an integer", result["_id"])
            logger.error(err)
    if all_messages:
        vectorizer = CountVectorizer(binary=True, analyzer="word",
                                     token_pattern="[^ ]+")
        count_vector_matrix = vectorizer.fit_transform(all_messages)
        for log_id in messages_by_ids:
            similarity_percent = round(
                1 - spatial.distance.cosine(
                    np.asarray(count_vector_matrix[0].toarray()),
                    np.asarray(count_vector_matrix[
                        messages_by_ids[log_id]].toarray())), 3)
            logger.debug(
                "Log with id %s has %.3f similarity with the log '%s'",
                log_id, similarity_percent, message)
            if similarity_percent >= self.search_cfg["SearchLogsMinSimilarity"]:
                similar_log_ids.add(log_id)
    return similar_log_ids
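# A minimal, self-contained sketch of the similarity check above (the sample
# messages are assumptions, not values from the code):
#
# from scipy import spatial
# from sklearn.feature_extraction.text import CountVectorizer
#
# queried = "connection timeout error"
# candidate = "connection timeout failure"
# vectorizer = CountVectorizer(binary=True, analyzer="word", token_pattern="[^ ]+")
# matrix = vectorizer.fit_transform([queried, candidate]).toarray()
# similarity = round(1 - spatial.distance.cosine(matrix[0], matrix[1]), 3)
# # Two of the four distinct tokens overlap: similarity = 2 / 3 ≈ 0.667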
def find_similarity(self, all_results, fields):
    for field in fields:
        if field in self.similarity_dict:
            continue
        self.similarity_dict[field] = {}
        log_field_ids = {}
        index_in_message_array = 0
        count_vector_matrix = None
        all_messages = []
        all_messages_needs_reweighting = []
        needs_reweighting_wc = False
        for log, res in all_results:
            for obj in [log] + res["hits"]["hits"]:
                if obj["_id"] not in log_field_ids:
                    if field not in self.artificial_columns and\
                            not obj["_source"][field].strip():
                        log_field_ids[obj["_id"]] = -1
                    else:
                        text = []
                        needs_reweighting = 0
                        if self.config["number_of_log_lines"] == -1 and\
                                field in self.fields_mapping_for_weighting:
                            fields_to_use = self.fields_mapping_for_weighting[field]
                            text = self.weighted_similarity_calculator.message_to_array(
                                obj["_source"][fields_to_use[0]],
                                obj["_source"][fields_to_use[1]])
                        elif field == "namespaces_stacktrace":
                            gathered_lines = []
                            weights = []
                            for line in obj["_source"]["stacktrace"].split("\n"):
                                line_words = utils.split_words(
                                    line,
                                    min_word_length=self.config["min_word_length"])
                                for word in line_words:
                                    part_of_namespace = ".".join(word.split(".")[:2])
                                    if part_of_namespace in self.config["chosen_namespaces"]:
                                        gathered_lines.append(" ".join(line_words))
                                        weights.append(
                                            self.config["chosen_namespaces"][part_of_namespace])
                            if obj["_id"] == log["_id"] and len(gathered_lines):
                                text = gathered_lines
                                self.object_id_weights[obj["_id"]] = weights
                            else:
                                text = []
                                for line in obj["_source"]["stacktrace"].split("\n"):
                                    text.append(" ".join(
                                        utils.split_words(
                                            utils.clean_from_brackets(line),
                                            min_word_length=self.config["min_word_length"])))
                                text = utils.filter_empty_lines(text)
                                self.object_id_weights[obj["_id"]] = [1] * len(text)
                        elif field.startswith("stacktrace"):
                            if utils.does_stacktrace_need_words_reweighting(
                                    obj["_source"][field]):
                                needs_reweighting = 1
                            text = self.weighted_similarity_calculator.message_to_array(
                                "", obj["_source"][field])
                        else:
                            text = utils.filter_empty_lines([
                                " ".join(
                                    utils.split_words(
                                        obj["_source"][field],
                                        min_word_length=self.config["min_word_length"]))])
                        if not text:
                            log_field_ids[obj["_id"]] = -1
                        else:
                            all_messages.extend(text)
                            all_messages_needs_reweighting.append(needs_reweighting)
                            # Store the [start, end] row range this object
                            # occupies in the message matrix
                            log_field_ids[obj["_id"]] = [
                                index_in_message_array, len(all_messages) - 1]
                            index_in_message_array += len(text)
        if all_messages:
            # Use raw word counts instead of binary flags only when every
            # gathered message was marked for reweighting
            needs_reweighting_wc = all_messages_needs_reweighting and\
                sum(all_messages_needs_reweighting) == len(all_messages_needs_reweighting)
            vectorizer = CountVectorizer(binary=not needs_reweighting_wc,
                                         analyzer="word",
                                         token_pattern="[^ ]+")
            count_vector_matrix = np.asarray(
                vectorizer.fit_transform(all_messages).toarray())
        for log, res in all_results:
            sim_dict = self._calculate_field_similarity(
                log, res, log_field_ids, count_vector_matrix,
                needs_reweighting_wc, field)
            for key in sim_dict:
                self.similarity_dict[field][key] = sim_dict[key]
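# Example of the namespace matching above (hypothetical word and config):
# for word = "com.example.service.UserService.login",
# ".".join(word.split(".")[:2]) yields "com.example", so the line is kept
# if "com.example" appears as a key in self.config["chosen_namespaces"],
# and the configured value is used as that line's weight.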
def search_logs(self, search_req):
    """Get all logs similar to given logs"""
    similar_log_ids = set()
    logger.info("Started searching by request %s", search_req.json())
    logger.info("ES Url %s",
                utils.remove_credentials_from_url(self.es_client.host))
    t_start = time()
    if not self.es_client.index_exists(str(search_req.projectId)):
        return []
    searched_logs = set()
    test_item_info = {}
    for message in search_req.logMessages:
        if not message.strip():
            continue
        queried_log = self.log_preparation._create_log_template()
        queried_log = self.log_preparation._fill_log_fields(
            queried_log, Log(logId=0, message=message), search_req.logLines)
        msg_words = " ".join(
            utils.split_words(queried_log["_source"]["message"]))
        if not msg_words.strip() or msg_words in searched_logs:
            continue
        searched_logs.add(msg_words)
        query = self.build_search_query(
            search_req, queried_log["_source"]["message"])
        res = self.es_client.es_client.search(
            index=str(search_req.projectId), body=query)
        for es_res in res["hits"]["hits"]:
            test_item_info[es_res["_id"]] = es_res["_source"]["test_item"]
        _similarity_calculator = similarity_calculator.SimilarityCalculator(
            {
                "max_query_terms": self.search_cfg["MaxQueryTerms"],
                "min_word_length": self.search_cfg["MinWordLength"],
                "min_should_match": "90%",
                "number_of_log_lines": search_req.logLines
            },
            weighted_similarity_calculator=self.weighted_log_similarity_calculator)
        _similarity_calculator.find_similarity([(queried_log, res)], ["message"])
        for group_id, similarity_obj in\
                _similarity_calculator.similarity_dict["message"].items():
            log_id, _ = group_id
            similarity_percent = similarity_obj["similarity"]
            logger.debug(
                "Log with id %s has %.3f similarity with the queried log '%s'",
                log_id, similarity_percent, queried_log["_source"]["message"])
            if similarity_percent >= self.search_cfg["SearchLogsMinSimilarity"]:
                similar_log_ids.add(
                    (utils.extract_real_id(log_id), int(test_item_info[log_id])))
    logger.info(
        "Finished searching by request %s with %d results. It took %.2f sec.",
        search_req.json(), len(similar_log_ids), time() - t_start)
    return [SearchLogInfo(logId=log_info[0], testItemId=log_info[1])
            for log_info in similar_log_ids]
def _prepare_log(self, launch, test_item, log):
    cleaned_message = self.clean_message(log.message)
    message = utils.leave_only_unique_lines(
        utils.sanitize_text(
            utils.first_lines(cleaned_message,
                              launch.analyzerConfig.numberOfLogLines)))
    detected_message, stacktrace = utils.detect_log_description_and_stacktrace(
        cleaned_message, default_log_number=1)
    detected_message_with_numbers = utils.remove_starting_datetime(detected_message)
    detected_message = utils.sanitize_text(detected_message)
    stacktrace = utils.sanitize_text(stacktrace)
    stacktrace = utils.leave_only_unique_lines(stacktrace)
    detected_message = utils.leave_only_unique_lines(detected_message)
    detected_message_with_numbers = utils.leave_only_unique_lines(
        detected_message_with_numbers)
    detected_message_only_numbers = utils.find_only_numbers(
        detected_message_with_numbers)
    return {
        "_id": log.logId,
        "_index": launch.project,
        "_source": {
            "launch_id": launch.launchId,
            "launch_name": launch.launchName,
            "test_item": test_item.testItemId,
            "unique_id": test_item.uniqueId,
            "test_case_hash": test_item.testCaseHash,
            "is_auto_analyzed": test_item.isAutoAnalyzed,
            "issue_type": test_item.issueType,
            "log_level": log.logLevel,
            "original_message_lines": utils.calculate_line_number(cleaned_message),
            "original_message_words_number": len(
                utils.split_words(cleaned_message, split_urls=False)),
            "message": message,
            "is_merged": False,
            "start_time": datetime(
                *test_item.startTime[:6]).strftime("%Y-%m-%d %H:%M:%S"),
            "merged_small_logs": "",
            "detected_message": detected_message,
            "detected_message_with_numbers": detected_message_with_numbers,
            "stacktrace": stacktrace,
            "only_numbers": detected_message_only_numbers
        }
    }
def compress(text):
    return " ".join(utils.split_words(text, only_unique=True))
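# Illustrative behavior (input is made up): compress("error error at line at")
# keeps a single occurrence of each word, e.g. "error at line", assuming
# utils.split_words with only_unique=True preserves first-seen word order.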
def _fill_log_fields(self, log_template, log, number_of_lines):
    cleaned_message = self.clean_message(log.message)
    message = utils.first_lines(cleaned_message, number_of_lines)
    message_without_params = message
    message = utils.sanitize_text(message)
    message_without_params = utils.clean_from_urls(message_without_params)
    message_without_params = utils.clean_from_paths(message_without_params)
    message_without_params = utils.clean_from_params(message_without_params)
    message_without_params_and_brackets = utils.remove_starting_datetime(
        message_without_params)
    message_without_params_and_brackets = utils.clean_from_brackets(
        message_without_params_and_brackets)
    message_without_params = utils.sanitize_text(message_without_params)
    detected_message, stacktrace = utils.detect_log_description_and_stacktrace(
        cleaned_message)
    detected_message_without_params = detected_message
    urls = " ".join(utils.extract_urls(detected_message_without_params))
    detected_message_without_params = utils.clean_from_urls(
        detected_message_without_params)
    paths = " ".join(utils.extract_paths(detected_message_without_params))
    detected_message_without_params = utils.clean_from_paths(
        detected_message_without_params)
    potential_status_codes = " ".join(
        utils.get_potential_status_codes(detected_message_without_params))
    message_params = " ".join(
        utils.extract_message_params(detected_message_without_params))
    detected_message_without_params = utils.clean_from_params(
        detected_message_without_params)
    detected_message_without_params_and_brackets = utils.remove_starting_datetime(
        detected_message_without_params)
    detected_message_without_params_and_brackets = utils.clean_from_brackets(
        detected_message_without_params_and_brackets)
    detected_message_without_params = utils.sanitize_text(
        detected_message_without_params)
    detected_message_with_numbers = utils.remove_starting_datetime(
        detected_message)
    detected_message_only_numbers = utils.find_only_numbers(
        detected_message_with_numbers)
    detected_message = utils.sanitize_text(detected_message)
    stacktrace = utils.sanitize_text(stacktrace)
    found_exceptions = utils.get_found_exceptions(detected_message)
    found_exceptions_extended = utils.enrich_found_exceptions(found_exceptions)
    log_template["_id"] = log.logId
    log_template["_source"]["cluster_id"] = log.clusterId
    log_template["_source"]["log_level"] = log.logLevel
    log_template["_source"]["original_message_lines"] =\
        utils.calculate_line_number(cleaned_message)
    log_template["_source"]["original_message_words_number"] = len(
        utils.split_words(cleaned_message, split_urls=False))
    log_template["_source"]["message"] = message
    log_template["_source"]["detected_message"] = detected_message
    log_template["_source"]["detected_message_with_numbers"] =\
        detected_message_with_numbers
    log_template["_source"]["stacktrace"] = stacktrace
    log_template["_source"]["only_numbers"] = detected_message_only_numbers
    log_template["_source"]["urls"] = urls
    log_template["_source"]["paths"] = paths
    log_template["_source"]["message_params"] = message_params
    log_template["_source"]["found_exceptions"] = found_exceptions
    log_template["_source"]["found_exceptions_extended"] =\
        found_exceptions_extended
    log_template["_source"]["detected_message_extended"] =\
        utils.enrich_text_with_method_and_classes(detected_message)
    log_template["_source"]["detected_message_without_params_extended"] =\
        utils.enrich_text_with_method_and_classes(detected_message_without_params)
    log_template["_source"]["stacktrace_extended"] =\
        utils.enrich_text_with_method_and_classes(stacktrace)
    log_template["_source"]["message_extended"] =\
        utils.enrich_text_with_method_and_classes(message)
    log_template["_source"]["message_without_params_extended"] =\
        utils.enrich_text_with_method_and_classes(message_without_params)
    log_template["_source"]["whole_message"] =\
        detected_message_with_numbers + " \n " + stacktrace
    log_template["_source"]["detected_message_without_params_and_brackets"] =\
        detected_message_without_params_and_brackets
    log_template["_source"]["message_without_params_and_brackets"] =\
        message_without_params_and_brackets
    log_template["_source"]["potential_status_codes"] =\
        potential_status_codes
    for field in ["message", "detected_message", "detected_message_with_numbers",
                  "stacktrace", "only_numbers", "found_exceptions",
                  "found_exceptions_extended", "detected_message_extended",
                  "detected_message_without_params_extended",
                  "stacktrace_extended", "message_extended",
                  "message_without_params_extended",
                  "detected_message_without_params_and_brackets",
                  "message_without_params_and_brackets"]:
        log_template["_source"][field] = utils.leave_only_unique_lines(
            log_template["_source"][field])
        log_template["_source"][field] = utils.clean_colon_stacking(
            log_template["_source"][field])
    return log_template