def generate_quality_report(language: str, report: Mapping[str, Any], ptr: ReferencePointer,
                            vnodes: Sequence[VirtualNode], max_files: int, name: str) -> str:
    """Generate report: classification report, confusion matrix, files with most errors."""
    avg_keys = {"macro avg", "micro avg", "weighted avg"}
    # Per-class entries first, ordered by descending support, then the aggregate rows.
    sorted_report = OrderedDict(
        (key, report["report"][key])
        for key in sorted(report["report"], key=lambda k: -report["report"][k]["support"])
        if key not in avg_keys)
    for key in avg_keys:
        sorted_report[key] = report["report"][key]
    # Rank files by how many of their nodes were mispredicted.
    mispredicted_paths = [vnode.path for vnode in vnodes
                          if vnode.y != getattr(vnode, "y_old", vnode.y)]
    files_to_show = Counter(mispredicted_paths).most_common()
    if max_files > 0:
        files_to_show = files_to_show[:max_files]
    template = load_jinja2_template(os.path.join(TEMPLATES_ROOT, "quality_report.md.jinja2"))
    # TODO(vmarkovtsev): move all the logic inside the template
    return template.render(
        language=language, ptr=ptr, conf_mat=report["confusion_matrix"],
        target_names=report["target_names"], files=files_to_show,
        cl_report=sorted_report, ppcr=report["ppcr"],
        cl_report_full=report["report_full"], name=name)
def generate_model_report(
        model: FormatModel, analyze_config: Dict[str, Any],
        languages: Optional[Union[str, Iterable[str]]] = None) -> str:
    """
    Generate report about model - description for each rule, min/max support, min/max confidence.

    :param model: trained format model.
    :param analyze_config: config that is used at the analysis stage. It is needed to calculate \
                           the real number of enabled rules.
    :param languages: Languages for which report should be created. You can specify one \
                      language as string, several as list of strings or None for all languages in \
                      the model.
    :return: report in str format.
    """
    if languages is None:
        languages = model.languages
    elif isinstance(languages, str):
        # Bug fix: a bare string is itself Iterable, so the generic Iterable check
        # used to leave it as-is and the loop below iterated over its characters.
        languages = [languages]
    elif not isinstance(languages, Iterable):
        languages = [languages]
    for language in languages:
        if language not in model:
            raise NotFittedError(language)
    template = load_jinja2_template(os.path.join(TEMPLATES_ROOT, "model_report.md.jinja2"))
    return template.render(model=model, languages=languages, analyze_config=analyze_config,
                           FeatureExtractor=FeatureExtractor, describe_rule=describe_rule)
def _finalize(self, reports: Iterable[Dict[str, str]]) -> Iterator[Dict[str, str]]:
    """
    Summarize all individual reports.

    :param reports: Reports generated by `TypoCommitsReporter.generate_commit_dataset_report()`
    :return: Summarized final report
    """
    reports = list(reports)
    # Accumulate the per-commit metric series into one summary series.
    scores = self.get_metrics_stub()
    for single_report in reports:
        scores += pandas.Series(json.loads(single_report["report"]))
    true_positive = scores.detection_true_positive
    scores.detection_precision = true_positive / (
        true_positive + scores.detection_false_positive)
    scores.detection_recall = true_positive / (
        true_positive + scores.detection_false_negatives)
    # Average the per-report metrics over the number of reports.
    n_reports = len(reports)
    scores.fix_accuracy /= n_reports
    scores.top3_fix_accuracy /= n_reports
    scores.review_time /= n_reports
    template = load_jinja2_template(self.report_template_path)
    rendered = template.render(scores=scores, commit=self._get_commit(),
                               package_version=self._get_package_version(),
                               failures=self._failures, tabulate=tabulate)
    yield {"report": rendered}
def generate_report(
    data: pandas.DataFrame,
    suggestions: Dict[int, List[Tuple[str, float]]],
) -> str:
    """Print scores for suggestions in an easy readable way."""
    template = load_jinja2_template(os.path.join(TEMPLATE_DIR, "scores.md.jinja2"))
    # Pass the rendering context explicitly instead of **locals(): locals() silently
    # leaked every local (including the template object itself) into the namespace
    # and would change behavior if a new local variable were ever introduced.
    return template.render(ScoreMode=ScoreMode, get_scores=get_scores,
                           data=data, suggestions=suggestions)
def evaluate_typos_on_identifiers(
        dataset: str = TYPOS_DATASET, config: Optional[Mapping[str, Any]] = None,
        mistakes_output: Optional[str] = None) -> str:
    """
    Run IdTyposAnalyzer on the identifiers from the evaluation dataset.

    :param dataset: Dataset of misspelled identifiers.
    :param config: Configuration for the IdTyposAnalyzer.
    :param mistakes_output: Path to the file for printing the wrong corrections.
    :return: Quality report.
    """
    identifiers = pandas.read_csv(dataset, header=0, usecols=[0, 1],
                                  names=["wrong", "correct"], keep_default_na=False)
    analyzer = IdTyposAnalyzer(IdTyposModel(), "", config if config is not None else {})
    suggestions = analyzer.check_identifiers(identifiers["wrong"].tolist())
    # Collect the ranked candidate corrections for every misspelled identifier;
    # fall back to the identifier itself when nothing was suggested.
    corrections = []
    for index, identifier in enumerate(identifiers["wrong"]):
        candidates = list(analyzer.generate_identifier_suggestions(suggestions[index], identifier))
        if not candidates:
            candidates = [Candidate(identifier, 1.0)]
        corrections.append(candidates)
    # One column per candidate rank; empty string when fewer candidates exist.
    n_candidates = analyzer.config["n_candidates"]
    for pos in range(n_candidates):
        identifiers["sugg " + str(pos)] = [
            correction[pos][0] if pos < len(correction) else ""
            for correction in corrections]
    if mistakes_output is not None:
        mistakes = identifiers[identifiers["sugg 0"] != identifiers["correct"]]
        mistakes[["wrong", "sugg 0", "correct"]].to_csv(mistakes_output)
    template = load_jinja2_template(
        os.path.join(TEMPLATE_DIR, "quality_on_identifiers.md.jinja2"))
    return template.render(
        identifiers=identifiers, suggestions=suggestions,
        vocabulary_tokens=analyzer.corrector.generator.tokens,
        n_candidates=n_candidates,
        IDENTIFIER_INDEX_COLUMN=IDENTIFIER_INDEX_COLUMN,
        Candidate=Candidate, Columns=Columns,
        tokenize=lambda x: list(analyzer.parser.split(x)),
        flatten_df_by_column=flatten_df_by_column,
        generate_report=generate_report)
def __init__(self, model: IdTyposModel, url: str, config: Mapping[str, Any]):
    """
    Initialize a new instance of IdTyposAnalyzer.

    :param model: The instance of the model loaded from the repository or freshly trained.
    :param url: The analyzed project's Git remote.
    :param config: Configuration of the analyzer of unspecified structure.
    """
    super().__init__(model, url, config)
    self.config = self._load_config(config)
    self.corrector = self.corrector_manager.get(self.config["corrector"])
    self.parser = self.create_token_parser()
    self.comment_template = load_jinja2_template(self.config["comment_template"])
    # Extend the corrector's vocabulary with every token from the model's identifiers.
    for identifier in model.identifiers:
        tokens = set(self.parser.split(identifier))
        self.corrector.expand_vocabulary(tokens)
    # An empty allowed set means every identifier is checked.
    if self.config["check_all_identifiers"]:
        self.allowed_identifiers = set()
    else:
        self.allowed_identifiers = model.identifiers
def _finalize(self, reports: Iterable[Dict[str, str]]) -> Iterator[Dict[str, str]]:
    """
    Summarize all individual reports.

    :param reports: Reports generated by `TypoCommitsReporter.generate_commit_dataset_report()`
    :return: Summarized final report
    """
    def format_series(series: pandas.Series, formatting: Tuple[str, str]) -> pandas.Series:
        """Format the listed keys of `series` and return them in the requested order."""
        series = series.copy()
        keys = []
        for key, fmt in formatting:
            keys.append(key)
            # Bug fix: read from the `series` argument instead of closing over the
            # outer `scores`, so the helper formats whatever series it is given.
            series.loc[key] = ("%" + fmt) % series[key]
        return series[keys]

    scores = self.get_metrics_stub()
    reports = list(reports)
    for report in reports:
        scores += pandas.Series(json.loads(report["report"]))
    scores.detection_precision = scores.detection_true_positive / (
        scores.detection_true_positive + scores.detection_false_positive)
    scores.detection_recall = scores.detection_true_positive / (
        scores.detection_true_positive + scores.detection_false_negatives)
    # Fix accuracies are averaged over the true positive detections; review time
    # is averaged over the number of reports.
    scores.fix_accuracy /= scores.detection_true_positive
    scores.top3_fix_accuracy /= scores.detection_true_positive
    scores.review_time /= len(reports)
    self._log.info("final scores are\n%s", repr(scores))
    template = load_jinja2_template(self.report_template_path)
    report = template.render(scores=scores, commit=self._get_commit(),
                             package_version=self._get_package_version(),
                             failures=self._failures, tabulate=tabulate,
                             format_series=format_series)
    yield {"report": report}