def test_create_problems_removes_empty_problems(self):
    """Verify that the create-problems action deletes Problem rows
    that have no reports attached to them."""
    today = datetime.date.today()
    orphan = Problem(first_occurrence=today, last_occurrence=today)
    self.db.session.add(orphan)
    self.db.session.flush()

    self.call_action("create-problems")

    # The report-less problem must be gone afterwards.
    self.assertEqual(self.db.session.query(Problem).count(), 0)
def _create_problems(self, db, problemplugin, report_min_count=0, speedup=False):
    """Cluster reports of one problem-plugin type and (re)assign them to
    Problem rows.

    db               - database accessor; ``db.session`` is used for
                       add/flush/expire (presumably a SQLAlchemy session
                       wrapper — confirm against caller)
    problemplugin    - plugin whose ``name`` selects the report type and
                       whose ``_db_report_to_satyr`` converts a report
                       into a satyr object for clustering
    report_min_count - when > 0, restrict to reports with at least this
                       many occurrences and, at the end, detach problems
                       from low-count reports
    speedup          - when True, load fewer reports (only those relevant
                       to problems plus unassigned ones) and assign
                       problems with a simpler in-place pass instead of
                       the full ``_iter_problems`` matching
    """
    if speedup:
        db_reports = get_reports_for_problems(db, problemplugin.name)
        db_reports += get_unassigned_reports(db, problemplugin.name,
                                             min_count=report_min_count)
    else:
        db_reports = get_reports_by_type(db, problemplugin.name,
                                         min_count=report_min_count)

    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {}
    for db_problem in db_problems:
        problems_dict[db_problem.id] = db_problem

    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)

    # create lookup dict for problems: exact report-id set -> problem id,
    # used later to reuse a problem when its report membership is unchanged
    reuse_problems = {}
    for (problem_id, report_ids) in problem_report.items():
        reuse_problems[tuple(sorted(report_ids))] = problem_id

    # reports whose satyr conversion failed but that are still assigned
    # to a problem; cleaned up at the very end
    invalid_report_ids_to_clean = []
    problems = []
    if not db_reports:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        # Single report: it forms a one-element problem only if it is not
        # already assigned.
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        # Convert reports to satyr objects; report_map maps them back.
        report_map = {}
        _satyr_reports = []
        i = 0
        for db_report in db_reports:
            i += 1
            self.log_debug("[{0} / {1}] Loading report #{2}"
                           .format(i, len(db_reports), db_report.id))
            _satyr_report = problemplugin._db_report_to_satyr(db_report)
            if _satyr_report is None:
                self.log_debug("Unable to create satyr report")
                if db_report.problem_id is not None:
                    invalid_report_ids_to_clean.append(db_report.id)
            else:
                _satyr_reports.append(_satyr_report)
                report_map[_satyr_report] = db_report
            # expire to keep session memory bounded while iterating
            db.session.expire(db_report)

        self.log_debug("Clustering")
        clusters = self._create_clusters(_satyr_reports, 2000)
        # Threads that share no function with another thread
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        i = 0
        for cluster in clusters:
            i += 1
            self.log_debug("[{0} / {1}] Computing distances"
                           .format(i, len(clusters)))
            distances = satyr.Distances(cluster, len(cluster))
            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        # Cut each dendrogram at distance 0.3; every resulting group of
        # duplicates becomes one candidate problem (a set of reports).
        for dendrogram, cluster in zip(dendrograms, clusters):
            problem = []
            for dups in dendrogram.cut(0.3, 1):
                reports = set(report_map[cluster[dup]] for dup in dups)
                problem.append(reports)
            problems.extend(problem)

        # Unique threads form their own unique problems
        for thread in unique_func_threads:
            problems.append({report_map[thread]})

    self.log_info("Creating problems from clusters")
    if speedup:
        # Fast path: assign problem ids directly, creating a new Problem
        # only when no report in the cluster is assigned yet.
        for problem in problems:
            if not problem:
                continue
            first_report = next(iter(problem))
            if len(problem) > 1:
                # Find assigned report (the last one found wins)
                origin_report = None
                for db_report in problem:
                    if db_report.problem_id:
                        origin_report = db_report
                comps = {}
                # Problem created only from new reports
                if not origin_report:
                    new = Problem()
                    db.session.add(new)
                    db.session.flush()
                    first_occurrence = first_report.first_occurrence
                    last_occurrence = first_report.last_occurrence
                    for rep in problem:
                        rep.problem_id = new.id
                        # widen the occurrence window over all reports
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1
                    self.update_comps(db, comps, new)
                    new.last_occurrence = last_occurrence
                    new.first_occurrence = first_occurrence
                else:
                    # Reuse the origin report's problem for the whole
                    # cluster; only unassigned reports get the id.
                    first_occurrence = origin_report.first_occurrence
                    last_occurrence = origin_report.last_occurrence
                    for rep in problem:
                        if not rep.problem_id:
                            rep.problem_id = origin_report.problem_id
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1
                    orig_p = get_problem_by_id(db, origin_report.problem_id)
                    self.update_comps(db, comps, orig_p)
                    orig_p.last_occurrence = last_occurrence
                    orig_p.first_occurrence = first_occurrence
            else:
                # The report is assigned
                if first_report.problem_id:
                    continue
                else:
                    # One report that wasn't matched with anything else
                    new = Problem()
                    new.first_occurrence = first_report.first_occurrence
                    new.last_occurrence = first_report.last_occurrence
                    db.session.add(new)
                    db.session.flush()
                    self.update_comps(db, {first_report.component: 1}, new)
                    first_report.problem_id = new.id
        db.session.flush()
    else:
        # Full path: let _iter_problems match clusters against existing
        # problems and yield the pairing plus a "reports changed" flag.
        for problem, db_problem, reports_changed in self._iter_problems(
                db, problems, db_problems, problems_dict, reuse_problems):

            comps = {}

            problem_last_occurrence = None
            problem_first_occurrence = None
            for db_report in problem:
                db_report.problem = db_problem

                if (problem_last_occurrence is None
                        or problem_last_occurrence < db_report.last_occurrence):
                    problem_last_occurrence = db_report.last_occurrence

                if (problem_first_occurrence is None
                        or problem_first_occurrence > db_report.first_occurrence):
                    problem_first_occurrence = db_report.first_occurrence

                if db_report.component not in comps:
                    comps[db_report.component] = 0

                comps[db_report.component] += 1

            # In case nothing changed, we don't want to mark db_problem
            # dirty which would cause another UPDATE
            if db_problem.first_occurrence != problem_first_occurrence:
                db_problem.first_occurrence = problem_first_occurrence
            if db_problem.last_occurrence != problem_last_occurrence:
                db_problem.last_occurrence = problem_last_occurrence

            if reports_changed:
                self.update_comps(db, comps, db_problem)

    # Detach reports whose satyr conversion failed from their problems.
    self.log_debug("Removing {0} invalid reports from problems"
                   .format(len(invalid_report_ids_to_clean)))
    for report_id in invalid_report_ids_to_clean:
        db_report = get_report_by_id(db, report_id)
        if db_report is not None:
            db_report.problem_id = None
            db.session.add(db_report)

    if report_min_count > 0:
        self.log_debug("Removing problems from low count reports")
        remove_problem_from_low_count_reports_by_type(db, problemplugin.name,
                                                      min_count=report_min_count)

    self.log_debug("Flushing session")
    db.session.flush()
def _iter_problems(self, db, problems, db_problems, problems_dict,
                   reuse_problems):
    """
    Yields (problem, db_problem, reports_changed) tuples.

    problem is a set/list of report objects, db_problem the Problem row
    they should be attached to, and reports_changed is False only when
    the problem was looked up by an unchanged exact report-id set.
    Works in three phases: exact lookup, best-match assignment, and
    creation of new problems for whatever is left.
    """
    # Three phases, see below
    # Counts for statistics
    i = 0
    lookedup_count = 0
    found_count = 0
    created_count = 0
    # List of problems left for the second phase
    second_pass = list()
    # List of possible matches for the second phase
    match_list = list()
    # Set of db_problems that were used in one of the phases. A db_problem
    # must be yielded at most once.
    db_problems_used = set()

    # Phase one: try to look up precise matches
    for problem in problems:
        i += 1
        self.log_debug("[{0} / {1}] Processing cluster"
                       .format(i, len(problems)))
        reports_changed = True
        # Exact match: the sorted tuple of report ids keys reuse_problems.
        problem_id = reuse_problems.get(
            tuple(sorted([db_report.id for db_report in problem])), None)
        if problem_id is not None:
            db_problem = problems_dict.get(problem_id, None)
            reports_changed = False
            lookedup_count += 1
            self.log_debug("Looked up existing problem #{0}"
                           .format(db_problem.id))
        else:
            matches = self._find_problem_matches(db_problems, problem)
            if not matches:
                # No possible match found, must be a new problem
                db_problem = Problem()
                db.session.add(db_problem)
                created_count += 1
            else:
                # Leave the problems for the second phase
                match_list += matches
                second_pass.append(problem)
                continue
        db_problems_used.add(db_problem)
        yield (problem, db_problem, reports_changed)

    # Phase two: yield problems in order of best match
    self.log_debug("Matching existing problems")
    self.log_debug("{0} possible matches".format(len(match_list)))
    # Sort by match metric descending so the strongest pairings win first;
    # each problem and each db_problem may be consumed only once.
    for match_metric, problem, db_problem in sorted(match_list,
                                                    key=itemgetter(0),
                                                    reverse=True):
        if problem not in second_pass:
            self.log_debug("Already matched")
            continue
        if db_problem in db_problems_used:
            self.log_debug("Problem already used")
            continue
        found_count += 1
        second_pass.remove(problem)
        db_problems_used.add(db_problem)
        self.log_debug("Found existing problem #{0} ({1:.2f})"
                       .format(db_problem.id, match_metric))
        yield (problem, db_problem, True)

    # Phase three: create new problems if no match was found above
    self.log_debug("Processing {0} leftover problems"
                   .format(len(second_pass)))
    for problem in second_pass:
        self.log_debug("Creating problem")
        db_problem = Problem()
        db.session.add(db_problem)
        created_count += 1
        yield (problem, db_problem, True)

    self.log_debug("Total: {0} Looked up: {1} Found: {2} Created: {3}"
                   .format(i, lookedup_count, found_count, created_count))
def _create_problems(
        self, db,
        problemplugin, #pylint: disable=too-many-statements
        report_min_count=0, speedup=False):
    """Cluster reports of one problem-plugin type and (re)assign them to
    Problem rows, converting reports to satyr objects in parallel.

    db               - database accessor; ``db.session`` is used for
                       add/flush/expire_all (presumably a SQLAlchemy
                       session wrapper — confirm against caller)
    problemplugin    - plugin whose ``name`` selects the report type and
                       whose ``db_report_to_satyr`` converts a report
                       for clustering
    report_min_count - when > 0, restrict to reports with at least this
                       many occurrences and, at the end, detach problems
                       from low-count reports
    speedup          - when True, load fewer reports, widen the
                       dendrogram cut by 10 %, and assign problems with a
                       simpler in-place pass instead of _iter_problems
    """
    if speedup:
        self.log_debug("[%s] Getting reports for problems",
                       problemplugin.name)
        db_reports = get_reports_for_problems(db, problemplugin.name)
        self.log_debug("[%s] Getting unassigned reports",
                       problemplugin.name)
        db_reports += get_unassigned_reports(db, problemplugin.name,
                                             min_count=report_min_count)
    else:
        db_reports = get_reports_by_type(db, problemplugin.name,
                                         min_count=report_min_count)

    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {}
    for db_problem in db_problems:
        problems_dict[db_problem.id] = db_problem

    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)

    # create lookup dict for problems: exact report-id set -> problem id
    reuse_problems = {}
    for (problem_id, report_ids) in problem_report.items():
        reuse_problems[tuple(sorted(report_ids))] = problem_id

    # reports whose satyr conversion failed but that are still assigned
    # to a problem; unassigned at the very end
    invalid_report_ids_to_clean = []
    problems = []
    if not db_reports:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        # Single report: it forms a one-element problem only if it is not
        # already assigned.
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        report_map = {}
        _satyr_reports = []
        db_reports_len = len(db_reports)
        n_processed = 1
        # split the work to multiple workers
        with ThreadPoolExecutor(self._max_workers) as executor:
            # schedule db_reports for processing
            futures = {
                executor.submit(problemplugin.db_report_to_satyr, report): report
                for report in db_reports
            }
            # collect conversions as they finish; pop keeps the dict from
            # holding completed futures alive
            for future in as_completed(futures):
                db_report = futures.pop(future)
                self.log_debug("[%d / %d] Loading report #%d",
                               n_processed, db_reports_len, db_report.id)
                _satyr_report = future.result()
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                    if db_report.problem_id is not None:
                        invalid_report_ids_to_clean.append(db_report.id)
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report
                n_processed += 1
        # drop cached ORM state accumulated while loading reports
        db.session.expire_all()

        self.log_debug("Clustering")
        clusters = self._create_clusters(_satyr_reports, 2000)
        # Threads that share no function with another thread
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        clusters_len = len(clusters)
        for i, cluster in enumerate(clusters, start=1):
            self.log_debug("[%d / %d] Computing distances", i, clusters_len)
            distances = satyr.Distances(cluster, len(cluster))
            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        # Base cut distance 0.3; the speedup path loosens it by 10 %.
        dendogram_cut = 0.3
        if speedup:
            dendogram_cut = dendogram_cut * 1.1

        for dendrogram, cluster in zip(dendrograms, clusters):
            problem = []
            for dups in dendrogram.cut(dendogram_cut, 1):
                reports = set(report_map[cluster[dup]] for dup in dups)
                problem.append(reports)
            problems.extend(problem)

        # Unique threads form their own unique problems
        for thread in unique_func_threads:
            problems.append({report_map[thread]})

    self.log_info("Creating problems from clusters")
    if speedup:
        # Fast path: assign problem ids directly, creating a new Problem
        # only when no report in the cluster is assigned yet.
        for problem in problems:
            if not problem:
                continue
            first_report = next(iter(problem))
            if len(problem) > 1:
                # Find assigned report (the last one found wins)
                origin_report = None
                for db_report in problem:
                    if db_report.problem_id:
                        origin_report = db_report
                comps = {}
                # Problem created only from new reports
                if not origin_report:
                    new = Problem()
                    db.session.add(new)
                    db.session.flush()
                    first_occurrence = first_report.first_occurrence
                    last_occurrence = first_report.last_occurrence
                    for rep in problem:
                        rep.problem_id = new.id
                        # widen the occurrence window over all reports
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1
                    self.update_comps(db, comps, new)
                    new.last_occurrence = last_occurrence
                    new.first_occurrence = first_occurrence
                else:
                    # Reuse the origin report's problem for the whole
                    # cluster; only unassigned reports get the id.
                    first_occurrence = origin_report.first_occurrence
                    last_occurrence = origin_report.last_occurrence
                    for rep in problem:
                        if not rep.problem_id:
                            rep.problem_id = origin_report.problem_id
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1
                    orig_p = get_problem_by_id(db, origin_report.problem_id)
                    self.update_comps(db, comps, orig_p)
                    orig_p.last_occurrence = last_occurrence
                    orig_p.first_occurrence = first_occurrence
            else:
                # The report is assigned
                if first_report.problem_id:
                    continue
                # One report that wasn't matched with anything else
                new = Problem()
                new.first_occurrence = first_report.first_occurrence
                new.last_occurrence = first_report.last_occurrence
                db.session.add(new)
                db.session.flush()
                self.update_comps(db, {first_report.component: 1}, new)
                first_report.problem_id = new.id
        db.session.flush()
    else:
        # Full path: let _iter_problems match clusters against existing
        # problems and yield the pairing plus a "reports changed" flag.
        for problem, db_problem, reports_changed in self._iter_problems(
                db, problems, db_problems, problems_dict, reuse_problems):

            comps = {}

            problem_last_occurrence = None
            problem_first_occurrence = None
            for db_report in problem:
                db_report.problem = db_problem

                if (problem_last_occurrence is None
                        or problem_last_occurrence < db_report.last_occurrence):
                    problem_last_occurrence = db_report.last_occurrence

                if (problem_first_occurrence is None
                        or problem_first_occurrence > db_report.first_occurrence):
                    problem_first_occurrence = db_report.first_occurrence

                if db_report.component not in comps:
                    comps[db_report.component] = 0

                comps[db_report.component] += 1

            # In case nothing changed, we don't want to mark db_problem
            # dirty which would cause another UPDATE
            if db_problem.first_occurrence != problem_first_occurrence:
                db_problem.first_occurrence = problem_first_occurrence
            if db_problem.last_occurrence != problem_last_occurrence:
                db_problem.last_occurrence = problem_last_occurrence

            if reports_changed:
                self.update_comps(db, comps, db_problem)

    # Detach reports whose satyr conversion failed from their problems.
    self.log_debug("Removing %d invalid reports from problems",
                   len(invalid_report_ids_to_clean))
    unassign_reports(db, invalid_report_ids_to_clean)

    if report_min_count > 0:
        self.log_debug("Removing problems from low count reports")
        remove_problem_from_low_count_reports_by_type(
            db, problemplugin.name, min_count=report_min_count)

    self.log_debug("Flushing session")
    db.session.flush()
def _create_problems(self, db, problemplugin):
    """Cluster reports of one problem-plugin type and assign them to
    Problem rows, creating new problems where no match exists.

    db            - database accessor; ``db.session`` is used for
                    add/flush/expire (presumably a SQLAlchemy session
                    wrapper — confirm against caller)
    problemplugin - plugin whose ``name`` selects the report type and
                    whose ``_db_report_to_satyr`` converts a report into
                    a satyr object for clustering

    Fixes over the previous revision:
    - first_occurrence was updated with ``<`` (moving it *later*), the
      mirror image of the last_occurrence check; it must use ``>`` so the
      problem's window widens to the earliest report.
    - found_count was incremented even when _find_problem returned None,
      double-counting creations as finds in the summary log.
    """
    db_reports = get_reports_by_type(db, problemplugin.name)
    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {}
    for db_problem in db_problems:
        problems_dict[db_problem.id] = db_problem

    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)

    # create lookup dict for problems: exact report-id set -> problem id
    reuse_problems = {}
    for (problem_id, report_ids) in problem_report.items():
        reuse_problems[tuple(sorted(report_ids))] = problem_id

    problems = []
    if len(db_reports) < 1:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        # Single report: it forms a one-element problem only if it is not
        # already assigned.
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        # Convert reports to satyr objects; report_map maps them back.
        report_map = {}
        _satyr_reports = []
        i = 0
        for db_report in db_reports:
            i += 1
            self.log_debug("[{0} / {1}] Loading report #{2}"
                           .format(i, len(db_reports), db_report.id))
            _satyr_report = problemplugin._db_report_to_satyr(db_report)
            if _satyr_report is None:
                self.log_debug("Unable to create satyr report")
            else:
                _satyr_reports.append(_satyr_report)
                report_map[_satyr_report] = db_report
            # expire to keep session memory bounded while iterating
            db.session.expire(db_report)

        self.log_debug("Clustering")
        clusters = self._create_clusters(_satyr_reports, 2000)
        # Threads that share no function with another thread
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        i = 0
        for cluster in clusters:
            i += 1
            self.log_debug("[{0} / {1}] Computing distances"
                           .format(i, len(clusters)))
            distances = satyr.Distances(cluster, len(cluster))
            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        # Cut each dendrogram at distance 0.3; every resulting group of
        # duplicates becomes one candidate problem (a set of reports).
        for dendrogram, cluster in zip(dendrograms, clusters):
            problem = []
            for dups in dendrogram.cut(0.3, 1):
                reports = set(report_map[cluster[dup]] for dup in dups)
                problem.append(reports)
            problems.extend(problem)

        # Unique threads form their own unique problems
        for thread in unique_func_threads:
            problems.append(set([report_map[thread]]))

    self.log_info("Creating problems")
    i = 0
    lookedup_count = 0
    found_count = 0
    created_count = 0

    for problem in problems:
        i += 1
        self.log_debug("[{0} / {1}] Creating problem"
                       .format(i, len(problems)))
        comps = {}

        reports_changed = True
        # Exact match: the sorted tuple of report ids keys reuse_problems.
        problem_id = reuse_problems.get(
            tuple(sorted([db_report.id for db_report in problem])), None)
        if problem_id is not None:
            db_problem = problems_dict.get(problem_id, None)
            reports_changed = False
            lookedup_count += 1
            self.log_debug("Looked up existing problem #{0}"
                           .format(db_problem.id))
        else:
            db_problem = self._find_problem(db_problems, problem)
            if db_problem is None:
                db_problem = Problem()
                db.session.add(db_problem)
                db_problems.append(db_problem)
                created_count += 1
            else:
                # count only genuine matches, not creations
                found_count += 1

        for db_report in problem:
            db_report.problem = db_problem

            if (db_problem.last_occurrence is None or
                    db_problem.last_occurrence < db_report.last_occurrence):
                db_problem.last_occurrence = db_report.last_occurrence

            # BUGFIX: was '<', which pushed first_occurrence later instead
            # of earlier; '>' widens the window to the earliest report.
            if (db_problem.first_occurrence is None or
                    db_problem.first_occurrence > db_report.first_occurrence):
                db_problem.first_occurrence = db_report.first_occurrence

            if db_report.component not in comps:
                comps[db_report.component] = 0
            comps[db_report.component] += 1

        if reports_changed:
            # Re-rank the problem's components by report count (descending)
            # and persist any missing ProblemComponent rows.
            db_comps = sorted(comps, key=lambda x: comps[x], reverse=True)

            order = 0
            for db_component in db_comps:
                order += 1
                db_pcomp = get_problem_component(db, db_problem, db_component)
                if db_pcomp is None:
                    db_pcomp = ProblemComponent()
                    db_pcomp.problem = db_problem
                    db_pcomp.component = db_component
                    db_pcomp.order = order
                    db.session.add(db_pcomp)

    self.log_debug("Total: {0} Looked up: {1} Found: {2} Created: {3}"
                   .format(i, lookedup_count, found_count, created_count))

    self.log_debug("Flushing session")
    db.session.flush()