Example 1
    def search_logs(self, search_req):
        """Get all logs similar to given logs"""
        similar_log_ids = set()
        logger.info("Started searching by request %s", search_req.json())
        logger.info("ES Url %s", utils.remove_credentials_from_url(self.host))
        t_start = time()
        if not self.index_exists(str(search_req.projectId)):
            return []
        searched_logs = set()
        for message in search_req.logMessages:
            if not message.strip():
                continue
            cleaned_message = self.clean_message(message)
            sanitized_msg = utils.leave_only_unique_lines(
                utils.sanitize_text(
                    utils.first_lines(cleaned_message, search_req.logLines)))

            msg_words = " ".join(utils.split_words(sanitized_msg))
            if msg_words in searched_logs:
                continue
            searched_logs.add(msg_words)
            query = self.build_search_query(search_req, sanitized_msg)
            res = self.es_client.search(index=str(search_req.projectId),
                                        body=query)
            similar_log_ids = similar_log_ids.union(
                self.find_similar_logs_by_cosine_similarity(
                    msg_words, message, res))

        logger.info(
            "Finished searching by request %s with %d results. It took %.2f sec.",
            search_req.json(), len(similar_log_ids),
            time() - t_start)
        return list(similar_log_ids)
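A minimal, self-contained sketch of the deduplication step above: messages that reduce to the same normalized word string trigger only one Elasticsearch round trip. The normalize helper below is a plain whitespace split standing in for the clean/sanitize/split_words chain, not the project's actual implementation.

    def normalize(message):
        # stand-in for the clean_message/sanitize_text/split_words chain above
        return " ".join(message.lower().split())

    searched_logs = set()
    for message in ["Timeout after 30s", "timeout   after 30s", "Connection refused"]:
        msg_words = normalize(message)
        if msg_words in searched_logs:
            continue  # an equivalent message was already queried
        searched_logs.add(msg_words)
        print("would query Elasticsearch for:", msg_words)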
Example 2
 def run(self):
     for field in self.fields:
         log_field_ids = {}
         index_in_message_array = 0
         count_vector_matrix = None
         all_messages = []
         for log, res in self.all_results:
             for obj in [log] + res["hits"]["hits"]:
                 if obj["_id"] not in log_field_ids:
                     text = " ".join(
                         utils.split_words(
                             obj["_source"][field],
                             min_word_length=self.min_word_length))
                     if text.strip() == "":
                         log_field_ids[obj["_id"]] = -1
                     else:
                         all_messages.append(text)
                         log_field_ids[obj["_id"]] = index_in_message_array
                         index_in_message_array += 1
         if len(all_messages) > 0:
             # binary bag-of-words over whitespace-separated tokens
             vectorizer = CountVectorizer(binary=True,
                                          analyzer="word",
                                          token_pattern="[^ ]+")
             count_vector_matrix = np.asarray(
                 vectorizer.fit_transform(all_messages).toarray())
         self.all_text_field_ids[field] = log_field_ids
         self.dict_count_vectorizer[field] = count_vector_matrix
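The vectorizer configuration used above can be exercised on its own; with binary=True and token_pattern="[^ ]+", every whitespace-separated token becomes a feature that is either present (1) or absent (0) in a message:

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer

    all_messages = ["connection refused by host", "connection reset by peer"]
    vectorizer = CountVectorizer(binary=True, analyzer="word", token_pattern="[^ ]+")
    count_vector_matrix = np.asarray(vectorizer.fit_transform(all_messages).toarray())
    print(vectorizer.get_feature_names_out())  # one column per distinct token
    print(count_vector_matrix)                 # 0/1 presence matrix, one row per message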
Example 3
    def message_to_array(self, detected_message_res, stacktrace_res):
        all_lines = [" ".join(utils.split_words(detected_message_res))]
        split_log_lines = utils.filter_empty_lines([
            " ".join(utils.split_words(line))
            for line in stacktrace_res.split("\n")
        ])
        split_log_lines_num = len(split_log_lines)
        data_in_block = max(
            self.min_log_number_in_block,
            math.ceil(split_log_lines_num / self.block_to_split))
        blocks_num = math.ceil(split_log_lines_num / data_in_block)

        for block in range(blocks_num):
            all_lines.append("\n".join(
                split_log_lines[block * data_in_block:(block + 1) *
                                data_in_block]))
        if len([line for line in all_lines if line.strip() != ""]) == 0:
            return []
        return all_lines
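The block arithmetic in message_to_array is easier to follow in isolation. With hypothetical settings min_log_number_in_block=2 and block_to_split=5 (both are instance attributes in the original; the values here are assumed), a 13-line stacktrace yields blocks of at most 3 lines:

    import math

    min_log_number_in_block = 2  # assumed configuration values
    block_to_split = 5
    split_log_lines = ["at frame%d" % i for i in range(13)]

    data_in_block = max(min_log_number_in_block,
                        math.ceil(len(split_log_lines) / block_to_split))
    blocks_num = math.ceil(len(split_log_lines) / data_in_block)
    for block in range(blocks_num):
        # prints 5 blocks: four of 3 lines and a final one of 1 line
        print(block, split_log_lines[block * data_in_block:(block + 1) * data_in_block])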
Example 4
 def perform_light_deduplication(self, data):
     text_messages_set = {}
     logs_to_train_idx = []
     additional_logs = {}
     for idx, text_message_data in enumerate(data):
         text_message = text_message_data[0]
         text_message_normalized = " ".join(
             sorted(utils.split_words(text_message, to_lower=True)))
         if text_message_normalized not in text_messages_set:
             logs_to_train_idx.append(idx)
             text_messages_set[text_message_normalized] = idx
             additional_logs[idx] = []
         else:
             additional_logs[
                 text_messages_set[text_message_normalized]].append(idx)
     return additional_logs, logs_to_train_idx
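A usage sketch for the light deduplication: messages containing the same words in any order collapse onto the first occurrence. The normalize helper approximates utils.split_words(text, to_lower=True) with a whitespace split.

    def normalize(text):
        # stand-in for " ".join(sorted(utils.split_words(text, to_lower=True)))
        return " ".join(sorted(text.lower().split()))

    data = [("error reading file",), ("File reading error",), ("disk full",)]
    text_messages_set, logs_to_train_idx, additional_logs = {}, [], {}
    for idx, (text_message,) in enumerate(data):
        key = normalize(text_message)
        if key not in text_messages_set:
            logs_to_train_idx.append(idx)
            text_messages_set[key] = idx
            additional_logs[idx] = []
        else:
            additional_logs[text_messages_set[key]].append(idx)
    print(logs_to_train_idx)  # [0, 2]
    print(additional_logs)    # {0: [1], 2: []}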
Example 5
    def prepare_log_words(self, launches):
        log_words = {}
        project = None
        for launch in launches:
            project = str(launch.project)
            for test_item in launch.testItems:
                for log in test_item.logs:

                    if log.logLevel < ERROR_LOGGING_LEVEL or not log.message.strip():
                        continue
                    clean_message = self.clean_message(log.message)
                    det_message, stacktrace = utils.detect_log_description_and_stacktrace(
                        clean_message)
                    for word in utils.split_words(stacktrace):
                        if "." in word and len(word.split(".")) > 2:
                            log_words[word] = 1
        return log_words, project
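The filter in the inner loop keeps only package-like tokens, i.e. words with more than two dot-separated segments, which is what separates qualified class names from version numbers or short fragments:

    words = ["java.lang.NullPointerException", "line 42",
             "org.example.service.Foo", "v1.2"]
    log_words = {w: 1 for w in words if "." in w and len(w.split(".")) > 2}
    print(log_words)  # keeps only the two fully qualified class names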
Example 6
    def find_similar_logs_by_cosine_similarity(self, msg_words, message, res):
        similar_log_ids = set()
        messages_by_ids = {}
        message_index_id = 1
        all_messages = [msg_words]

        for result in res["hits"]["hits"]:
            try:
                log_id = int(re.search(r"\d+", result["_id"]).group(0))
                if log_id not in messages_by_ids:
                    log_query_words = " ".join(
                        utils.split_words(result["_source"]["message"]))
                    all_messages.append(log_query_words)
                    messages_by_ids[log_id] = message_index_id
                    message_index_id += 1
            except Exception as err:
                logger.error("Id %s is not integer", result["_id"])
                logger.error(err)
        if all_messages:
            vectorizer = CountVectorizer(binary=True,
                                         analyzer="word",
                                         token_pattern="[^ ]+")
            count_vector_matrix = vectorizer.fit_transform(all_messages)
            # spatial.distance.cosine expects 1-D vectors, so flatten each
            # single-row matrix before comparing
            queried_vector = count_vector_matrix[0].toarray().flatten()
            for log_id in messages_by_ids:
                candidate_vector = count_vector_matrix[
                    messages_by_ids[log_id]].toarray().flatten()
                similarity_percent = round(
                    1 - spatial.distance.cosine(queried_vector, candidate_vector), 3)
                logger.debug(
                    "Log with id %s has %.3f similarity with the log '%s'",
                    log_id, similarity_percent, message)
                if similarity_percent >= self.search_cfg[
                        "SearchLogsMinSimilarity"]:
                    similar_log_ids.add(log_id)
        return similar_log_ids
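The similarity computation itself, reduced to its essentials: binary bag-of-words vectors compared through scipy's cosine distance (similarity = 1 - distance). The first message plays the role of msg_words above.

    from scipy import spatial
    from sklearn.feature_extraction.text import CountVectorizer

    all_messages = ["connection timed out", "connection timed out retrying",
                    "disk quota exceeded"]
    matrix = CountVectorizer(binary=True, analyzer="word",
                             token_pattern="[^ ]+").fit_transform(all_messages)
    query = matrix[0].toarray().flatten()  # cosine() needs 1-D vectors
    for i in range(1, len(all_messages)):
        candidate = matrix[i].toarray().flatten()
        print(all_messages[i], round(1 - spatial.distance.cosine(query, candidate), 3))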
Example 7
 def find_similarity(self, all_results, fields):
     for field in fields:
         if field in self.similarity_dict:
             continue
         self.similarity_dict[field] = {}
         log_field_ids = {}
         index_in_message_array = 0
         count_vector_matrix = None
         all_messages = []
         all_messages_needs_reweighting = []
         needs_reweighting_wc = False
         for log, res in all_results:
             for obj in [log] + res["hits"]["hits"]:
                 if obj["_id"] not in log_field_ids:
                      if field not in self.artificial_columns and \
                              not obj["_source"][field].strip():
                         log_field_ids[obj["_id"]] = -1
                     else:
                         text = []
                         needs_reweighting = 0
                         if self.config["number_of_log_lines"] == -1 and\
                                 field in self.fields_mapping_for_weighting:
                             fields_to_use = self.fields_mapping_for_weighting[
                                 field]
                             text = self.weighted_similarity_calculator.message_to_array(
                                 obj["_source"][fields_to_use[0]],
                                 obj["_source"][fields_to_use[1]])
                         elif field == "namespaces_stacktrace":
                             gathered_lines = []
                             weights = []
                             for line in obj["_source"]["stacktrace"].split(
                                     "\n"):
                                  line_words = utils.split_words(
                                      line,
                                      min_word_length=self.config["min_word_length"])
                                 for word in line_words:
                                     part_of_namespace = ".".join(
                                         word.split(".")[:2])
                                     if part_of_namespace in self.config[
                                             "chosen_namespaces"]:
                                         gathered_lines.append(
                                             " ".join(line_words))
                                          weights.append(
                                              self.config["chosen_namespaces"][
                                                  part_of_namespace])
                             if obj["_id"] == log["_id"] and len(
                                     gathered_lines):
                                 text = gathered_lines
                                 self.object_id_weights[
                                     obj["_id"]] = weights
                             else:
                                 text = []
                                 for line in obj["_source"][
                                         "stacktrace"].split("\n"):
                                     text.append(" ".join(
                                         utils.split_words(
                                             utils.clean_from_brackets(
                                                 line),
                                             min_word_length=self.
                                             config["min_word_length"])))
                                 text = utils.filter_empty_lines(text)
                                 self.object_id_weights[
                                     obj["_id"]] = [1] * len(text)
                         elif field.startswith("stacktrace"):
                             if utils.does_stacktrace_need_words_reweighting(
                                     obj["_source"][field]):
                                 needs_reweighting = 1
                             text = self.weighted_similarity_calculator.message_to_array(
                                 "", obj["_source"][field])
                         else:
                              text = utils.filter_empty_lines([
                                  " ".join(
                                      utils.split_words(
                                          obj["_source"][field],
                                          min_word_length=self.config["min_word_length"]))
                              ])
                         if not text:
                             log_field_ids[obj["_id"]] = -1
                         else:
                             all_messages.extend(text)
                             all_messages_needs_reweighting.append(
                                 needs_reweighting)
                             log_field_ids[obj["_id"]] = [
                                 index_in_message_array,
                                 len(all_messages) - 1
                             ]
                             index_in_message_array += len(text)
         if all_messages:
             needs_reweighting_wc = all_messages_needs_reweighting and\
                 sum(all_messages_needs_reweighting) == len(all_messages_needs_reweighting)
             vectorizer = CountVectorizer(binary=not needs_reweighting_wc,
                                          analyzer="word",
                                          token_pattern="[^ ]+")
             count_vector_matrix = np.asarray(
                 vectorizer.fit_transform(all_messages).toarray())
         for log, res in all_results:
             sim_dict = self._calculate_field_similarity(
                 log, res, log_field_ids, count_vector_matrix,
                 needs_reweighting_wc, field)
             for key in sim_dict:
                 self.similarity_dict[field][key] = sim_dict[key]
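The namespaces_stacktrace branch keys its line weights on the first two dot-separated segments of each token. Isolated, with an invented chosen_namespaces map (in the original it comes from self.config):

    chosen_namespaces = {"org.example": 2.0}  # hypothetical config entry
    line = "at org.example.service.UserService.load(UserService.java:42)"
    for word in line.split():
        part_of_namespace = ".".join(word.split(".")[:2])
        if part_of_namespace in chosen_namespaces:
            print(word, "-> weight", chosen_namespaces[part_of_namespace])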
Example 8
    def search_logs(self, search_req):
        """Get all logs similar to given logs"""
        similar_log_ids = set()
        logger.info("Started searching by request %s", search_req.json())
        logger.info("ES Url %s",
                    utils.remove_credentials_from_url(self.es_client.host))
        t_start = time()
        if not self.es_client.index_exists(str(search_req.projectId)):
            return []
        searched_logs = set()
        test_item_info = {}

        for message in search_req.logMessages:
            if not message.strip():
                continue

            queried_log = self.log_preparation._create_log_template()
            queried_log = self.log_preparation._fill_log_fields(
                queried_log, Log(logId=0, message=message),
                search_req.logLines)

            msg_words = " ".join(
                utils.split_words(queried_log["_source"]["message"]))
            if not msg_words.strip() or msg_words in searched_logs:
                continue
            searched_logs.add(msg_words)
            query = self.build_search_query(search_req,
                                            queried_log["_source"]["message"])
            res = self.es_client.es_client.search(
                index=str(search_req.projectId), body=query)
            for es_res in res["hits"]["hits"]:
                test_item_info[es_res["_id"]] = es_res["_source"]["test_item"]

            _similarity_calculator = similarity_calculator.SimilarityCalculator(
                {
                    "max_query_terms": self.search_cfg["MaxQueryTerms"],
                    "min_word_length": self.search_cfg["MinWordLength"],
                    "min_should_match": "90%",
                    "number_of_log_lines": search_req.logLines
                },
                weighted_similarity_calculator=self.weighted_log_similarity_calculator)
            _similarity_calculator.find_similarity([(queried_log, res)],
                                                   ["message"])

            for group_id, similarity_obj in _similarity_calculator.similarity_dict[
                    "message"].items():
                log_id, _ = group_id
                similarity_percent = similarity_obj["similarity"]
                logger.debug(
                    "Log with id %s has %.3f similarity with the queried log '%s'",
                    log_id, similarity_percent,
                    queried_log["_source"]["message"])
                if similarity_percent >= self.search_cfg[
                        "SearchLogsMinSimilarity"]:
                    similar_log_ids.add((utils.extract_real_id(log_id),
                                         int(test_item_info[log_id])))

        logger.info(
            "Finished searching by request %s with %d results. It took %.2f sec.",
            search_req.json(), len(similar_log_ids),
            time() - t_start)
        return [
            SearchLogInfo(logId=log_info[0], testItemId=log_info[1])
            for log_info in similar_log_ids
        ]
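The final collection step can be isolated. The dictionary shapes below mirror what the loop above consumes ((log_id, _) keys mapped to {"similarity": ...} values); the ids and scores are made up, and the utils.extract_real_id call from the original is omitted:

    search_logs_min_similarity = 0.95  # plays the role of search_cfg["SearchLogsMinSimilarity"]
    test_item_info = {"log_1": "101", "log_2": "102"}
    similarity_by_message = {
        ("log_1", 0): {"similarity": 0.97},
        ("log_2", 0): {"similarity": 0.41},
    }
    similar_log_ids = set()
    for (log_id, _), similarity_obj in similarity_by_message.items():
        if similarity_obj["similarity"] >= search_logs_min_similarity:
            similar_log_ids.add((log_id, int(test_item_info[log_id])))
    print(similar_log_ids)  # {('log_1', 101)}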
Example 9
    def _prepare_log(self, launch, test_item, log):
        cleaned_message = self.clean_message(log.message)

        message = utils.leave_only_unique_lines(
            utils.sanitize_text(
                utils.first_lines(cleaned_message,
                                  launch.analyzerConfig.numberOfLogLines)))

        detected_message, stacktrace = utils.detect_log_description_and_stacktrace(
            cleaned_message, default_log_number=1)

        detected_message_with_numbers = utils.remove_starting_datetime(
            detected_message)
        detected_message = utils.sanitize_text(detected_message)
        stacktrace = utils.sanitize_text(stacktrace)

        stacktrace = utils.leave_only_unique_lines(stacktrace)
        detected_message = utils.leave_only_unique_lines(detected_message)
        detected_message_with_numbers = utils.leave_only_unique_lines(
            detected_message_with_numbers)

        detected_message_only_numbers = utils.find_only_numbers(
            detected_message_with_numbers)

        return {
            "_id": log.logId,
            "_index": launch.project,
            "_source": {
                "launch_id": launch.launchId,
                "launch_name": launch.launchName,
                "test_item": test_item.testItemId,
                "unique_id": test_item.uniqueId,
                "test_case_hash": test_item.testCaseHash,
                "is_auto_analyzed": test_item.isAutoAnalyzed,
                "issue_type": test_item.issueType,
                "log_level": log.logLevel,
                "original_message_lines": utils.calculate_line_number(cleaned_message),
                "original_message_words_number": len(
                    utils.split_words(cleaned_message, split_urls=False)),
                "message": message,
                "is_merged": False,
                "start_time": datetime(
                    *test_item.startTime[:6]).strftime("%Y-%m-%d %H:%M:%S"),
                "merged_small_logs": "",
                "detected_message": detected_message,
                "detected_message_with_numbers": detected_message_with_numbers,
                "stacktrace": stacktrace,
                "only_numbers": detected_message_only_numbers
            }
        }
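The start_time conversion assumes test_item.startTime is a time-tuple-like sequence of at least six integers; anything past the sixth element is dropped by the [:6] slice:

    from datetime import datetime

    start_time = (2023, 5, 17, 14, 30, 2, 999)  # trailing elements are ignored
    print(datetime(*start_time[:6]).strftime("%Y-%m-%d %H:%M:%S"))  # 2023-05-17 14:30:02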
Example 10
 def compress(text):
     return " ".join(utils.split_words(text, only_unique=True))
Example 11
    def _fill_log_fields(self, log_template, log, number_of_lines):
        cleaned_message = self.clean_message(log.message)

        message = utils.first_lines(cleaned_message, number_of_lines)
        message_without_params = message
        message = utils.sanitize_text(message)

        message_without_params = utils.clean_from_urls(message_without_params)
        message_without_params = utils.clean_from_paths(message_without_params)
        message_without_params = utils.clean_from_params(
            message_without_params)
        message_without_params_and_brackets = utils.remove_starting_datetime(
            message_without_params)
        message_without_params_and_brackets = utils.clean_from_brackets(
            message_without_params_and_brackets)
        message_without_params = utils.sanitize_text(message_without_params)

        detected_message, stacktrace = utils.detect_log_description_and_stacktrace(
            cleaned_message)

        detected_message_without_params = detected_message
        urls = " ".join(utils.extract_urls(detected_message_without_params))
        detected_message_without_params = utils.clean_from_urls(
            detected_message_without_params)
        paths = " ".join(utils.extract_paths(detected_message_without_params))
        detected_message_without_params = utils.clean_from_paths(
            detected_message_without_params)
        potential_status_codes = " ".join(
            utils.get_potential_status_codes(detected_message_without_params))
        message_params = " ".join(
            utils.extract_message_params(detected_message_without_params))
        detected_message_without_params = utils.clean_from_params(
            detected_message_without_params)
        detected_message_without_params_and_brackets = utils.remove_starting_datetime(
            detected_message_without_params)
        detected_message_without_params_and_brackets = utils.clean_from_brackets(
            detected_message_without_params_and_brackets)
        detected_message_without_params = utils.sanitize_text(
            detected_message_without_params)

        detected_message_with_numbers = utils.remove_starting_datetime(
            detected_message)
        detected_message_only_numbers = utils.find_only_numbers(
            detected_message_with_numbers)
        detected_message = utils.sanitize_text(detected_message)
        stacktrace = utils.sanitize_text(stacktrace)
        found_exceptions = utils.get_found_exceptions(detected_message)
        found_exceptions_extended = utils.enrich_found_exceptions(
            found_exceptions)

        log_template["_id"] = log.logId
        log_template["_source"]["cluster_id"] = log.clusterId
        log_template["_source"]["log_level"] = log.logLevel
        log_template["_source"][
            "original_message_lines"] = utils.calculate_line_number(
                cleaned_message)
        log_template["_source"]["original_message_words_number"] = len(
            utils.split_words(cleaned_message, split_urls=False))
        log_template["_source"]["message"] = message
        log_template["_source"]["detected_message"] = detected_message
        log_template["_source"][
            "detected_message_with_numbers"] = detected_message_with_numbers
        log_template["_source"]["stacktrace"] = stacktrace
        log_template["_source"]["only_numbers"] = detected_message_only_numbers
        log_template["_source"]["urls"] = urls
        log_template["_source"]["paths"] = paths
        log_template["_source"]["message_params"] = message_params
        log_template["_source"]["found_exceptions"] = found_exceptions
        log_template["_source"][
            "found_exceptions_extended"] = found_exceptions_extended
        log_template["_source"]["detected_message_extended"] =\
            utils.enrich_text_with_method_and_classes(detected_message)
        log_template["_source"]["detected_message_without_params_extended"] =\
            utils.enrich_text_with_method_and_classes(detected_message_without_params)
        log_template["_source"]["stacktrace_extended"] =\
            utils.enrich_text_with_method_and_classes(stacktrace)
        log_template["_source"]["message_extended"] =\
            utils.enrich_text_with_method_and_classes(message)
        log_template["_source"]["message_without_params_extended"] =\
            utils.enrich_text_with_method_and_classes(message_without_params)
        log_template["_source"][
            "whole_message"] = detected_message_with_numbers + " \n " + stacktrace
        log_template["_source"]["detected_message_without_params_and_brackets"] =\
            detected_message_without_params_and_brackets
        log_template["_source"]["message_without_params_and_brackets"] =\
            message_without_params_and_brackets
        log_template["_source"]["potential_status_codes"] =\
            potential_status_codes

        for field in [
                "message", "detected_message", "detected_message_with_numbers",
                "stacktrace", "only_numbers", "found_exceptions",
                "found_exceptions_extended", "detected_message_extended",
                "detected_message_without_params_extended",
                "stacktrace_extended", "message_extended",
                "message_without_params_extended",
                "detected_message_without_params_and_brackets",
                "message_without_params_and_brackets"
        ]:
            log_template["_source"][field] = utils.leave_only_unique_lines(
                log_template["_source"][field])
            log_template["_source"][field] = utils.clean_colon_stacking(
                log_template["_source"][field])
        return log_template
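The ordering in _fill_log_fields matters: URLs and paths are extracted before they are stripped, so the later parameter cleaning never sees them. A toy version of that extract-then-clean pattern, with a naive regex standing in for the real utils helpers:

    import re

    def clean_from_urls(text):
        # naive stand-in for utils.clean_from_urls
        return re.sub(r"https?://\S+", "", text)

    message = "GET https://example.com/api failed with code 503"
    urls = " ".join(re.findall(r"https?://\S+", message))
    message_without_params = clean_from_urls(message)
    print(urls)                    # https://example.com/api
    print(message_without_params)  # GET  failed with code 503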