def run(self, cmdline, db):
    """
    Mark a problem probably fixed if there is a new build of the problem's
    affected package, for which no crash reports have come in.
    """

    try:
        tasks = self._get_tasks(cmdline, db)
    except FafError as ex:
        self.log_error("Unable to process command line arguments: {0}"
                       .format(str(ex)))
        return 1

    problems = get_problems(db)

    task_i = 0
    for osplugin, db_release in tasks:
        task_i += 1

        self.log_info("[{0} / {1}] Processing '{2} {3}'"
                      .format(task_i, len(tasks), osplugin.nice_name,
                              db_release.version))

        self.log_debug("Getting builds...")
        opsys_builds = osplugin.get_released_builds(db_release.version)

        newest_builds = {}
        all_builds = {}
        now = datetime.now()
        for build in opsys_builds:
            age = now - build["completion_time"]
            # If a hot new build comes out, we need to wait a certain
            # period of time for people to use it before we can make
            # conclusions about it being a probable fix.
            if age.days >= osplugin.build_aging_days:
                if build["name"] not in newest_builds:
                    newest_builds[build["name"]] = build

                if build["name"] not in all_builds:
                    all_builds[build["name"]] = [build, ]
                else:
                    all_builds[build["name"]].append(build)

        probably_fixed_total = 0
        problems_in_release = 0
        problem_counter = 0
        for problem in problems:
            problem_counter += 1
            self.log_debug("Processing problem ID:{0} {1}/{2}:"
                           .format(problem.id, problem_counter, len(problems)))
            affected_newest = {}
            affected_not_found = False

            reports_for_release = \
                get_reports_for_opsysrelease(db, problem.id, db_release.id)

            # For all the reports, we need the affected packages and their
            # newest versions.
            if reports_for_release:
                problems_in_release += 1
            else:
                self.log_debug(" This problem doesn't appear in this release.")
                self._save_probable_fix(db, problem, db_release, None)
                # Next problem
                continue

            for report in reports_for_release:
                # First we try to find the affected package among the known
                # packages.
                affected_known = [
                    (affected.build.base_package_name,
                     affected.build.epoch,
                     affected.build.version,
                     affected.build.release)
                    for affected in
                    get_crashed_package_for_report(db, report.id)]

                # Then among the unknown packages.
                affected_unknown = \
                    get_crashed_unknown_package_nevr_for_report(db, report.id)

                # We get the base package name directly from the report.
                affected_unknown = [(report.component.name,
                                     affected[1],
                                     affected[2],
                                     affected[3])
                                    for affected in affected_unknown]

                affected_all = affected_known + affected_unknown
                if not affected_all:
                    affected_not_found = True
                    break

                for affected in affected_all:
                    if affected[0] in affected_newest:
                        # If a problem contains multiple reports with the same
                        # affected package, we only want the newest version of
                        # it.
                        affected_newest[affected[0]]['reports'].append(report)
                        if cmp_evr(affected[1:],
                                   affected_newest[affected[0]]['nevr'][1:]) > 0:
                            affected_newest[affected[0]]['nevr'] = affected
                    else:
                        affected_newest[affected[0]] = {
                            'reports': [report, ],
                            'nevr': affected
                        }

            if affected_not_found or not affected_newest:
                # Affected package of one of the reports was not found.
                # We can't make any conclusions.
                self.log_debug(" Affected package not found.")
                self._save_probable_fix(db, problem, db_release, None)
                # Next problem
                continue

            if len(affected_newest) > 1:
                # Multiple different affected packages => cannot be fixed
                # by a single package update
                self.log_debug(" Multiple affected packages. No simple fix.")
                self._save_probable_fix(db, problem, db_release, None)
                # Next problem
                continue

            probably_fixed_since = datetime.fromtimestamp(0)

            pkg = list(affected_newest.values())[0]

            name = pkg['nevr'][0]
            newest_build = newest_builds.get(name, False)
            if newest_build:
                newest_evr = (newest_build["epoch"] or 0,
                              newest_build["version"],
                              newest_build["release"])
            if newest_build and cmp_evr(newest_evr, pkg['nevr'][1:]) > 0:
                # Newest available build is newer than the newest version
                # of the affected package. Now find the oldest such
                # probable fix.
                i = 0
                while i < len(all_builds[name]) and cmp_evr(
                        (all_builds[name][i]["epoch"] or 0,
                         all_builds[name][i]["version"],
                         all_builds[name][i]["release"]),
                        pkg['nevr'][1:]) > 0:
                    i += 1

                completion_time = all_builds[name][i - 1]["completion_time"]
                probably_fixed_since = max(completion_time,
                                           probably_fixed_since)
                pkg["probable_fix"] = (name,
                                       all_builds[name][i - 1]["epoch"] or 0,
                                       all_builds[name][i - 1]["version"],
                                       all_builds[name][i - 1]["release"])

                self._save_probable_fix(db, problem, db_release,
                                        pkg["probable_fix"],
                                        probably_fixed_since)
                self.log_debug(" Probably fixed for {0} days.".format(
                    (datetime.now() - probably_fixed_since).days))
                probably_fixed_total += 1
            else:
                self._save_probable_fix(db, problem, db_release, None)
                self.log_debug(" Not fixed.")

        db.session.flush()

        if problems_in_release > 0:
            self.log_info("{0}% of problems in this release probably fixed."
                          .format((probably_fixed_total * 100) //
                                  problems_in_release))
        else:
            self.log_info("No problems found in this release.")

    return 0

def _create_problems(self, db, problemplugin,
                     report_min_count=0, speedup=False):
    if speedup:
        db_reports = get_reports_for_problems(db, problemplugin.name)
        db_reports += get_unassigned_reports(db, problemplugin.name,
                                             min_count=report_min_count)
    else:
        db_reports = get_reports_by_type(db, problemplugin.name,
                                         min_count=report_min_count)

    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {}
    for db_problem in db_problems:
        problems_dict[db_problem.id] = db_problem
    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)
    # create lookup dict for problems
    reuse_problems = {}
    for (problem_id, report_ids) in problem_report.items():
        reuse_problems[tuple(sorted(report_ids))] = problem_id

    invalid_report_ids_to_clean = []
    problems = []
    if not db_reports:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        report_map = {}
        _satyr_reports = []
        i = 0
        for db_report in db_reports:
            i += 1
            self.log_debug("[{0} / {1}] Loading report #{2}"
                           .format(i, len(db_reports), db_report.id))

            _satyr_report = problemplugin._db_report_to_satyr(db_report)

            if _satyr_report is None:
                self.log_debug("Unable to create satyr report")
                if db_report.problem_id is not None:
                    invalid_report_ids_to_clean.append(db_report.id)
            else:
                _satyr_reports.append(_satyr_report)
                report_map[_satyr_report] = db_report

            db.session.expire(db_report)

        self.log_debug("Clustering")
        clusters = self._create_clusters(_satyr_reports, 2000)
        # Threads that share no function with another thread
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        i = 0
        for cluster in clusters:
            i += 1
            self.log_debug("[{0} / {1}] Computing distances"
                           .format(i, len(clusters)))

            distances = satyr.Distances(cluster, len(cluster))

            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        for dendrogram, cluster in zip(dendrograms, clusters):
            problem = []
            for dups in dendrogram.cut(0.3, 1):
                reports = set(report_map[cluster[dup]] for dup in dups)
                problem.append(reports)

            problems.extend(problem)

        # Unique threads form their own unique problems
        for thread in unique_func_threads:
            problems.append({report_map[thread]})

    self.log_info("Creating problems from clusters")
    if speedup:
        for problem in problems:
            if not problem:
                continue
            first_report = next(iter(problem))
            if len(problem) > 1:
                # Find assigned report
                origin_report = None
                for db_report in problem:
                    if db_report.problem_id:
                        origin_report = db_report

                # Problem created only from new reports
                comps = {}
                if not origin_report:
                    new = Problem()
                    db.session.add(new)
                    db.session.flush()
                    first_occurrence = first_report.first_occurrence
                    last_occurrence = first_report.last_occurrence
                    for rep in problem:
                        rep.problem_id = new.id
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1

                    self.update_comps(db, comps, new)
                    new.last_occurrence = last_occurrence
                    new.first_occurrence = first_occurrence
                else:
                    first_occurrence = origin_report.first_occurrence
                    last_occurrence = origin_report.last_occurrence
                    for rep in problem:
                        if not rep.problem_id:
                            rep.problem_id = origin_report.problem_id
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1

                    orig_p = get_problem_by_id(db, origin_report.problem_id)
                    self.update_comps(db, comps, orig_p)
                    orig_p.last_occurrence = last_occurrence
                    orig_p.first_occurrence = first_occurrence
            else:
                # The report is assigned
                if first_report.problem_id:
                    continue
                else:
                    # One report that wasn't matched with anything else
                    new = Problem()
                    new.first_occurrence = first_report.first_occurrence
                    new.last_occurrence = first_report.last_occurrence
                    db.session.add(new)
                    db.session.flush()

                    self.update_comps(db, {first_report.component: 1}, new)
                    first_report.problem_id = new.id

        db.session.flush()
    else:
        for problem, db_problem, reports_changed in self._iter_problems(
                db, problems, db_problems, problems_dict, reuse_problems):

            comps = {}

            problem_last_occurrence = None
            problem_first_occurrence = None
            for db_report in problem:
                db_report.problem = db_problem

                if (problem_last_occurrence is None or
                        problem_last_occurrence < db_report.last_occurrence):
                    problem_last_occurrence = db_report.last_occurrence

                if (problem_first_occurrence is None or
                        problem_first_occurrence > db_report.first_occurrence):
                    problem_first_occurrence = db_report.first_occurrence

                if db_report.component not in comps:
                    comps[db_report.component] = 0

                comps[db_report.component] += 1

            # In case nothing changed, we don't want to mark db_problem
            # dirty which would cause another UPDATE
            if db_problem.first_occurrence != problem_first_occurrence:
                db_problem.first_occurrence = problem_first_occurrence
            if db_problem.last_occurrence != problem_last_occurrence:
                db_problem.last_occurrence = problem_last_occurrence

            if reports_changed:
                self.update_comps(db, comps, db_problem)

    self.log_debug("Removing {0} invalid reports from problems"
                   .format(len(invalid_report_ids_to_clean)))
    for report_id in invalid_report_ids_to_clean:
        db_report = get_report_by_id(db, report_id)
        if db_report is not None:
            db_report.problem_id = None
            db.session.add(db_report)

    if report_min_count > 0:
        self.log_debug("Removing problems from low count reports")
        remove_problem_from_low_count_reports_by_type(
            db, problemplugin.name, min_count=report_min_count)

    self.log_debug("Flushing session")
    db.session.flush()

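# Illustrative sketch (not from faf itself): a minimal, standalone use of the
# satyr clustering calls employed above. Distances between the threads in one
# cluster are computed first, and the resulting dendrogram is cut at a fixed
# level (0.3 above) to obtain groups of duplicate reports. The helper name and
# the assumption that satyr thread objects are already built are hypothetical.
def _cluster_sketch(satyr_threads, cut_level=0.3):
    import satyr

    distances = satyr.Distances(satyr_threads, len(satyr_threads))
    dendrogram = satyr.Dendrogram(distances)
    # cut(level, min_cluster_size) mirrors the dendrogram.cut(0.3, 1) call above
    return dendrogram.cut(cut_level, 1)
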
def run(self, cmdline, db):
    """
    Mark a problem probably fixed if there is a new build of the problem's
    affected package, for which no crash reports have come in.
    """

    try:
        tasks = self._get_tasks(cmdline, db)
    except FafError as ex:
        self.log_error("Unable to process command line arguments: {0}"
                       .format(str(ex)))
        return 1

    problems = get_problems(db)

    task_i = 0
    for osplugin, db_release in tasks:
        task_i += 1

        self.log_info("[{0} / {1}] Processing '{2} {3}'"
                      .format(task_i, len(tasks), osplugin.nice_name,
                              db_release.version))

        self.log_debug("Getting builds...")
        opsys_builds = osplugin.get_released_builds(db_release.version)

        newest_builds = {}
        all_builds = {}
        now = datetime.now()
        for build in opsys_builds:
            age = now - build["completion_time"]
            # If a hot new build comes out, we need to wait a certain
            # period of time for people to use it before we can make
            # conclusions about it being a probable fix.
            if age.days >= osplugin.build_aging_days:
                if build["name"] not in newest_builds:
                    newest_builds[build["name"]] = build

                if build["name"] not in all_builds:
                    all_builds[build["name"]] = [build, ]
                else:
                    all_builds[build["name"]].append(build)

        probably_fixed_total = 0
        problems_in_release = 0
        problem_counter = 0
        for problem in problems:
            problem_counter += 1
            self.log_debug("Processing problem ID:{0} {1}/{2}:"
                           .format(problem.id, problem_counter, len(problems)))
            affected_newest = {}
            affected_not_found = False

            reports_for_release = \
                get_reports_for_opsysrelease(db, problem.id, db_release.id)

            # For all the reports, we need the affected packages and their
            # newest versions.
            if reports_for_release:
                problems_in_release += 1
            else:
                self.log_debug(" This problem doesn't appear in this release.")
                self._save_probable_fix(db, problem, db_release, None)
                # Next problem
                continue

            for report in reports_for_release:
                # First we try to find the affected package among the known
                # packages.
                affected_known = [
                    (affected.build.base_package_name,
                     affected.build.epoch,
                     affected.build.version,
                     affected.build.release)
                    for affected in
                    get_crashed_package_for_report(db, report.id)]

                # Then among the unknown packages.
                affected_unknown = \
                    get_crashed_unknown_package_nevr_for_report(db, report.id)

                # We get the base package name directly from the report.
                affected_unknown = [(report.component.name,
                                     affected[1],
                                     affected[2],
                                     affected[3])
                                    for affected in affected_unknown]

                affected_all = affected_known + affected_unknown
                if not affected_all:
                    affected_not_found = True
                    break

                for affected in affected_all:
                    if affected[0] in affected_newest:
                        # If a problem contains multiple reports with the same
                        # affected package, we only want the newest version of
                        # it.
                        affected_newest[affected[0]]['reports'].append(report)
                        if cmp_evr(affected[1:],
                                   affected_newest[affected[0]]['nevr'][1:]) > 0:
                            affected_newest[affected[0]]['nevr'] = affected
                    else:
                        affected_newest[affected[0]] = {
                            'reports': [report, ],
                            'nevr': affected
                        }

            if affected_not_found or not affected_newest:
                # Affected package of one of the reports was not found.
                # We can't make any conclusions.
                self.log_debug(" Affected package not found.")
                self._save_probable_fix(db, problem, db_release, None)
                # Next problem
                continue

            if len(affected_newest) > 1:
                # Multiple different affected packages => cannot be fixed
                # by a single package update
                self.log_debug(" Multiple affected packages. No simple fix.")
                self._save_probable_fix(db, problem, db_release, None)
                # Next problem
                continue

            probably_fixed_since = datetime.fromtimestamp(0)

            pkg = list(affected_newest.values())[0]

            name = pkg['nevr'][0]
            newest_build = newest_builds.get(name, False)
            if newest_build:
                newest_evr = (newest_build["epoch"] or 0,
                              newest_build["version"],
                              newest_build["release"])
            if newest_build and cmp_evr(newest_evr, pkg['nevr'][1:]) > 0:
                # Newest available build is newer than the newest version
                # of the affected package. Now find the oldest such
                # probable fix.
                i = 0
                while i < len(all_builds[name]) and cmp_evr(
                        (all_builds[name][i]["epoch"] or 0,
                         all_builds[name][i]["version"],
                         all_builds[name][i]["release"]),
                        pkg['nevr'][1:]) > 0:
                    i += 1

                completion_time = all_builds[name][i - 1]["completion_time"]
                probably_fixed_since = max(completion_time,
                                           probably_fixed_since)
                pkg["probable_fix"] = (name,
                                       all_builds[name][i - 1]["epoch"] or 0,
                                       all_builds[name][i - 1]["version"],
                                       all_builds[name][i - 1]["release"])

                self._save_probable_fix(db, problem, db_release,
                                        pkg["probable_fix"],
                                        probably_fixed_since)
                self.log_debug(" Probably fixed for {0} days.".format(
                    (datetime.now() - probably_fixed_since).days))
                probably_fixed_total += 1
            else:
                self._save_probable_fix(db, problem, db_release, None)
                self.log_debug(" Not fixed.")

        db.session.flush()

        if problems_in_release > 0:
            self.log_info("{0}% of problems in this release probably fixed."
                          .format((probably_fixed_total * 100) //
                                  problems_in_release))
        else:
            self.log_info("No problems found in this release.")

def _create_problems(self,
                     db,
                     problemplugin, #pylint: disable=too-many-statements
                     report_min_count=0,
                     speedup=False):
    if speedup:
        self.log_debug("[%s] Getting reports for problems",
                       problemplugin.name)
        db_reports = get_reports_for_problems(db, problemplugin.name)
        self.log_debug("[%s] Getting unassigned reports", problemplugin.name)
        db_reports += get_unassigned_reports(db, problemplugin.name,
                                             min_count=report_min_count)
    else:
        db_reports = get_reports_by_type(db, problemplugin.name,
                                         min_count=report_min_count)

    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {}
    for db_problem in db_problems:
        problems_dict[db_problem.id] = db_problem
    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)
    # create lookup dict for problems
    reuse_problems = {}
    for (problem_id, report_ids) in problem_report.items():
        reuse_problems[tuple(sorted(report_ids))] = problem_id

    invalid_report_ids_to_clean = []
    problems = []
    if not db_reports:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        report_map = {}
        _satyr_reports = []
        db_reports_len = len(db_reports)
        n_processed = 1

        # split the work to multiple workers
        with ThreadPoolExecutor(self._max_workers) as executor:
            # schedule db_reports for processing
            futures = {
                executor.submit(problemplugin.db_report_to_satyr, report): report
                for report in db_reports
            }

            for future in as_completed(futures):
                db_report = futures.pop(future)
                self.log_debug("[%d / %d] Loading report #%d",
                               n_processed, db_reports_len, db_report.id)

                _satyr_report = future.result()
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                    if db_report.problem_id is not None:
                        invalid_report_ids_to_clean.append(db_report.id)
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report

                n_processed += 1

        db.session.expire_all()

        self.log_debug("Clustering")
        clusters = self._create_clusters(_satyr_reports, 2000)
        # Threads that share no function with another thread
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        clusters_len = len(clusters)
        for i, cluster in enumerate(clusters, start=1):
            self.log_debug("[%d / %d] Computing distances", i, clusters_len)
            distances = satyr.Distances(cluster, len(cluster))

            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        dendogram_cut = 0.3
        if speedup:
            dendogram_cut = dendogram_cut * 1.1

        for dendrogram, cluster in zip(dendrograms, clusters):
            problem = []
            for dups in dendrogram.cut(dendogram_cut, 1):
                reports = set(report_map[cluster[dup]] for dup in dups)
                problem.append(reports)

            problems.extend(problem)

        # Unique threads form their own unique problems
        for thread in unique_func_threads:
            problems.append({report_map[thread]})

    self.log_info("Creating problems from clusters")
    if speedup:
        for problem in problems:
            if not problem:
                continue
            first_report = next(iter(problem))
            if len(problem) > 1:
                # Find assigned report
                origin_report = None
                for db_report in problem:
                    if db_report.problem_id:
                        origin_report = db_report

                # Problem created only from new reports
                comps = {}
                if not origin_report:
                    new = Problem()
                    db.session.add(new)
                    db.session.flush()
                    first_occurrence = first_report.first_occurrence
                    last_occurrence = first_report.last_occurrence
                    for rep in problem:
                        rep.problem_id = new.id
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1

                    self.update_comps(db, comps, new)
                    new.last_occurrence = last_occurrence
                    new.first_occurrence = first_occurrence
                else:
                    first_occurrence = origin_report.first_occurrence
                    last_occurrence = origin_report.last_occurrence
                    for rep in problem:
                        if not rep.problem_id:
                            rep.problem_id = origin_report.problem_id
                        if first_occurrence > rep.first_occurrence:
                            first_occurrence = rep.first_occurrence
                        if last_occurrence < rep.last_occurrence:
                            last_occurrence = rep.last_occurrence
                        if rep.component not in comps:
                            comps[rep.component] = 0
                        comps[rep.component] += 1

                    orig_p = get_problem_by_id(db, origin_report.problem_id)
                    self.update_comps(db, comps, orig_p)
                    orig_p.last_occurrence = last_occurrence
                    orig_p.first_occurrence = first_occurrence
            else:
                # The report is assigned
                if first_report.problem_id:
                    continue

                # One report that wasn't matched with anything else
                new = Problem()
                new.first_occurrence = first_report.first_occurrence
                new.last_occurrence = first_report.last_occurrence
                db.session.add(new)
                db.session.flush()

                self.update_comps(db, {first_report.component: 1}, new)
                first_report.problem_id = new.id

        db.session.flush()
    else:
        for problem, db_problem, reports_changed in self._iter_problems(
                db, problems, db_problems, problems_dict, reuse_problems):

            comps = {}

            problem_last_occurrence = None
            problem_first_occurrence = None
            for db_report in problem:
                db_report.problem = db_problem

                if (problem_last_occurrence is None or
                        problem_last_occurrence < db_report.last_occurrence):
                    problem_last_occurrence = db_report.last_occurrence

                if (problem_first_occurrence is None or
                        problem_first_occurrence > db_report.first_occurrence):
                    problem_first_occurrence = db_report.first_occurrence

                if db_report.component not in comps:
                    comps[db_report.component] = 0

                comps[db_report.component] += 1

            # In case nothing changed, we don't want to mark db_problem
            # dirty which would cause another UPDATE
            if db_problem.first_occurrence != problem_first_occurrence:
                db_problem.first_occurrence = problem_first_occurrence
            if db_problem.last_occurrence != problem_last_occurrence:
                db_problem.last_occurrence = problem_last_occurrence

            if reports_changed:
                self.update_comps(db, comps, db_problem)

    self.log_debug("Removing %d invalid reports from problems",
                   len(invalid_report_ids_to_clean))
    unassign_reports(db, invalid_report_ids_to_clean)

    if report_min_count > 0:
        self.log_debug("Removing problems from low count reports")
        remove_problem_from_low_count_reports_by_type(
            db, problemplugin.name, min_count=report_min_count)

    self.log_debug("Flushing session")
    db.session.flush()

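# Illustrative sketch (not from faf itself): the parallel report loading above
# follows the standard concurrent.futures pattern -- submit one task per item,
# then consume results with as_completed() so slow items do not block the
# rest. The helper name and the max_workers default are hypothetical; items
# are assumed to be hashable.
def _parallel_map_sketch(func, items, max_workers=4):
    from concurrent.futures import ThreadPoolExecutor, as_completed

    results = {}
    with ThreadPoolExecutor(max_workers) as executor:
        futures = {executor.submit(func, item): item for item in items}
        for future in as_completed(futures):
            # map the finished future back to its input item
            results[futures[future]] = future.result()
    return results
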
def _create_problems(self, db, problemplugin):
    db_reports = get_reports_by_type(db, problemplugin.name)
    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {}
    for db_problem in db_problems:
        problems_dict[db_problem.id] = db_problem
    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)
    # create lookup dict for problems
    reuse_problems = {}
    for (problem_id, report_ids) in problem_report.items():
        reuse_problems[tuple(sorted(report_ids))] = problem_id

    problems = []
    if len(db_reports) < 1:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        report_map = {}
        _satyr_reports = []
        i = 0
        for db_report in db_reports:
            i += 1
            self.log_debug("[{0} / {1}] Loading report #{2}"
                           .format(i, len(db_reports), db_report.id))

            _satyr_report = problemplugin._db_report_to_satyr(db_report)

            if _satyr_report is None:
                self.log_debug("Unable to create satyr report")
            else:
                _satyr_reports.append(_satyr_report)
                report_map[_satyr_report] = db_report

            db.session.expire(db_report)

        self.log_debug("Clustering")
        clusters = self._create_clusters(_satyr_reports, 2000)
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        i = 0
        for cluster in clusters:
            i += 1
            self.log_debug("[{0} / {1}] Computing distances"
                           .format(i, len(clusters)))

            distances = satyr.Distances(cluster, len(cluster))

            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        for dendrogram, cluster in zip(dendrograms, clusters):
            problem = []
            for dups in dendrogram.cut(0.3, 1):
                reports = set(report_map[cluster[dup]] for dup in dups)
                problem.append(reports)

            problems.extend(problem)

        for thread in unique_func_threads:
            problems.append(set([report_map[thread]]))

    self.log_info("Creating problems")
    i = 0
    lookedup_count = 0
    found_count = 0
    created_count = 0

    for problem in problems:
        i += 1
        self.log_debug("[{0} / {1}] Creating problem"
                       .format(i, len(problems)))
        comps = {}

        reports_changed = True
        problem_id = reuse_problems.get(
            tuple(sorted([db_report.id for db_report in problem])), None)
        if problem_id is not None:
            db_problem = problems_dict.get(problem_id, None)
            reports_changed = False
            lookedup_count += 1
            self.log_debug("Looked up existing problem #{0}"
                           .format(db_problem.id))
        else:
            db_problem = self._find_problem(db_problems, problem)
            found_count += 1

        if db_problem is None:
            db_problem = Problem()
            db.session.add(db_problem)
            db_problems.append(db_problem)
            created_count += 1

        for db_report in problem:
            db_report.problem = db_problem

            # Keep the latest last_occurrence and the earliest first_occurrence.
            if (db_problem.last_occurrence is None or
                    db_problem.last_occurrence < db_report.last_occurrence):
                db_problem.last_occurrence = db_report.last_occurrence

            if (db_problem.first_occurrence is None or
                    db_problem.first_occurrence > db_report.first_occurrence):
                db_problem.first_occurrence = db_report.first_occurrence

            if db_report.component not in comps:
                comps[db_report.component] = 0

            comps[db_report.component] += 1

        if reports_changed:
            db_comps = sorted(comps, key=lambda x: comps[x], reverse=True)

            order = 0
            for db_component in db_comps:
                order += 1

                db_pcomp = get_problem_component(db, db_problem, db_component)
                if db_pcomp is None:
                    db_pcomp = ProblemComponent()
                    db_pcomp.problem = db_problem
                    db_pcomp.component = db_component
                    db_pcomp.order = order
                    db.session.add(db_pcomp)

    self.log_debug("Total: {0} Looked up: {1} Found: {2} Created: {3}"
                   .format(i, lookedup_count, found_count, created_count))
    self.log_debug("Flushing session")
    db.session.flush()

def _create_problems(self, db, problemplugin):
    db_reports = get_reports_by_type(db, problemplugin.name)
    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {}
    for db_problem in db_problems:
        problems_dict[db_problem.id] = db_problem
    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)
    # create lookup dict for problems
    reuse_problems = {}
    for (problem_id, report_ids) in problem_report.items():
        reuse_problems[tuple(sorted(report_ids))] = problem_id

    invalid_report_ids_to_clean = []
    problems = []
    if len(db_reports) < 1:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        report_map = {}
        _satyr_reports = []
        i = 0
        for db_report in db_reports:
            i += 1
            self.log_debug("[{0} / {1}] Loading report #{2}"
                           .format(i, len(db_reports), db_report.id))

            _satyr_report = problemplugin._db_report_to_satyr(db_report)

            if _satyr_report is None:
                self.log_debug("Unable to create satyr report")
                if db_report.problem_id is not None:
                    invalid_report_ids_to_clean.append(db_report.id)
            else:
                _satyr_reports.append(_satyr_report)
                report_map[_satyr_report] = db_report

            db.session.expire(db_report)

        self.log_debug("Clustering")
        clusters = self._create_clusters(_satyr_reports, 2000)
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        i = 0
        for cluster in clusters:
            i += 1
            self.log_debug("[{0} / {1}] Computing distances"
                           .format(i, len(clusters)))

            distances = satyr.Distances(cluster, len(cluster))

            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        for dendrogram, cluster in zip(dendrograms, clusters):
            problem = []
            for dups in dendrogram.cut(0.3, 1):
                reports = set(report_map[cluster[dup]] for dup in dups)
                problem.append(reports)

            problems.extend(problem)

        for thread in unique_func_threads:
            problems.append(set([report_map[thread]]))

    self.log_info("Creating problems from clusters")
    for problem, db_problem, reports_changed in self._iter_problems(
            db, problems, db_problems, problems_dict, reuse_problems):

        comps = {}

        problem_last_occurrence = None
        problem_first_occurrence = None
        for db_report in problem:
            db_report.problem = db_problem

            if (problem_last_occurrence is None or
                    problem_last_occurrence < db_report.last_occurrence):
                problem_last_occurrence = db_report.last_occurrence

            if (problem_first_occurrence is None or
                    problem_first_occurrence > db_report.first_occurrence):
                problem_first_occurrence = db_report.first_occurrence

            if db_report.component not in comps:
                comps[db_report.component] = 0

            comps[db_report.component] += 1

        # In case nothing changed, we don't want to mark db_problem dirty
        # which would cause another UPDATE
        if db_problem.first_occurrence != problem_first_occurrence:
            db_problem.first_occurrence = problem_first_occurrence
        if db_problem.last_occurrence != problem_last_occurrence:
            db_problem.last_occurrence = problem_last_occurrence

        if reports_changed:
            db_comps = sorted(comps, key=lambda x: comps[x], reverse=True)

            order = 0
            for db_component in db_comps:
                order += 1

                db_pcomp = get_problem_component(db, db_problem, db_component)
                if db_pcomp is None:
                    db_pcomp = ProblemComponent()
                    db_pcomp.problem = db_problem
                    db_pcomp.component = db_component
                    db_pcomp.order = order
                    db.session.add(db_pcomp)

    self.log_debug("Removing {0} invalid reports from problems"
                   .format(len(invalid_report_ids_to_clean)))
    for report_id in invalid_report_ids_to_clean:
        db_report = get_report_by_id(db, report_id)
        if db_report is not None:
            db_report.problem_id = None
            db.session.add(db_report)

    self.log_debug("Flushing session")
    db.session.flush()