Ejemplo n.º 1
0
def generate_quality_report(language: str, report: Mapping[str, Any],
                            ptr: ReferencePointer,
                            vnodes: Sequence[VirtualNode], max_files: int,
                            name: str) -> str:
    """
    Generate report: classification report, confusion matrix, files with most errors.

    :param language: Language name, forwarded to the template.
    :param report: Mapping with the "report", "confusion_matrix", "target_names", "ppcr" and \
                   "report_full" entries. "report" must map every class name and each of \
                   "micro avg", "macro avg", "weighted avg" to a mapping with a "support" value.
    :param ptr: Reference pointer, forwarded to the template.
    :param vnodes: Virtual nodes. A node counts as a misprediction when its `y` differs from \
                   its `y_old`; nodes without `y_old` never count.
    :param max_files: Maximum number of files to list. Non-positive value means no limit.
    :param name: Report name, forwarded to the template.
    :return: Rendered report.
    """
    # A tuple instead of a set: the iteration order of a set of strings depends on the
    # per-process hash seed, so the averages rows used to change order between runs.
    avg_keys = ("micro avg", "macro avg", "weighted avg")
    cl_report = report["report"]
    # Classes go first, the biggest support on top; the averages are appended afterwards.
    sorted_report = OrderedDict(
        (key, cl_report[key])
        for key in sorted(cl_report, key=lambda k: -cl_report[k]["support"])
        if key not in avg_keys)
    for key in avg_keys:
        sorted_report[key] = cl_report[key]
    # sort files by mispredictions
    file_stat = Counter(vnode.path for vnode in vnodes
                        if vnode.y != getattr(vnode, "y_old", vnode.y))
    to_show = file_stat.most_common()
    if max_files > 0:
        to_show = to_show[:max_files]

    template = load_jinja2_template(
        os.path.join(TEMPLATES_ROOT, "quality_report.md.jinja2"))
    # TODO(vmarkovtsev): move all the logic inside the template
    return template.render(language=language,
                           ptr=ptr,
                           conf_mat=report["confusion_matrix"],
                           target_names=report["target_names"],
                           files=to_show,
                           cl_report=sorted_report,
                           ppcr=report["ppcr"],
                           cl_report_full=report["report_full"],
                           name=name)
Ejemplo n.º 2
0
def generate_model_report(
        model: FormatModel,
        analyze_config: Dict[str, Any],
        languages: Optional[Union[str, Iterable[str]]] = None) -> str:
    """
    Generate report about model - description for each rule, min/max support, min/max confidence.

    :param model: trained format model.
    :param analyze_config: config that is used at the analysis stage. It is needed to calculate \
                           the real number of enabled rules.
    :param languages: Languages for which report should be created. You can specify one \
                      language as string, several as list of strings or None for all languages in \
                      the model.
    :return: report in str format.
    :raise NotFittedError: if the model does not contain one of the requested languages.
    """
    if languages is None:
        languages = model.languages
    if isinstance(languages, str) or not isinstance(languages, Iterable):
        # `str` is itself an Iterable, so it must be checked first: otherwise a single
        # language name would be split into characters.
        languages = [languages]
    else:
        # Materialize so that a one-shot iterable is not exhausted by the validation below
        # before it reaches the template.
        languages = list(languages)
    for language in languages:
        if language not in model:
            raise NotFittedError(language)
    template = load_jinja2_template(
        os.path.join(TEMPLATES_ROOT, "model_report.md.jinja2"))
    return template.render(model=model,
                           languages=languages,
                           analyze_config=analyze_config,
                           FeatureExtractor=FeatureExtractor,
                           describe_rule=describe_rule)
Ejemplo n.º 3
0
    def _finalize(
            self, reports: Iterable[Dict[str,
                                         str]]) -> Iterator[Dict[str, str]]:
        """
        Aggregate the individual reports into a single rendered summary.

        The JSON-encoded "report" entry of every report is added to the metrics stub. Detection \
        precision and recall are then derived from the summed counters, and the fix accuracies \
        and the review time are divided by the number of reports.

        :param reports: Reports generated by `TypoCommitsReporter.generate_commit_dataset_report()`
        :return: Iterator over a single dict that holds the rendered summary under "report".
        """
        scores = self.get_metrics_stub()
        reports = list(reports)
        n_reports = len(reports)
        for single in reports:
            scores += pandas.Series(json.loads(single["report"]))

        detected = scores.detection_true_positive + scores.detection_false_positive
        scores.detection_precision = scores.detection_true_positive / detected
        expected = scores.detection_true_positive + scores.detection_false_negatives
        scores.detection_recall = scores.detection_true_positive / expected
        # These metrics were summed over the reports above, turn them into per-report means.
        for metric in ("fix_accuracy", "top3_fix_accuracy", "review_time"):
            setattr(scores, metric, getattr(scores, metric) / n_reports)

        template = load_jinja2_template(self.report_template_path)
        rendered = template.render(scores=scores,
                                   commit=self._get_commit(),
                                   package_version=self._get_package_version(),
                                   failures=self._failures,
                                   tabulate=tabulate)

        yield {"report": rendered}
Ejemplo n.º 4
0
def generate_report(
    data: pandas.DataFrame,
    suggestions: Dict[int, List[Tuple[str, float]]],
) -> str:
    """
    Render the scores for suggestions in an easy readable way.

    The template context consists of `data`, `suggestions`, `ScoreMode` and `get_scores`.

    :param data: DataFrame exposed to the template as `data`.
    :param suggestions: Suggestions exposed to the template as `suggestions`.
    :return: Rendered report text.
    """
    template = load_jinja2_template(
        os.path.join(TEMPLATE_DIR, "scores.md.jinja2"))
    # The context is spelled out instead of `**locals()`: renaming or adding a local variable
    # must not silently change what the template can see.
    return template.render(data=data,
                           suggestions=suggestions,
                           ScoreMode=ScoreMode,
                           get_scores=get_scores)
Ejemplo n.º 5
0
def evaluate_typos_on_identifiers(
        dataset: str = TYPOS_DATASET,
        config: Optional[Mapping[str, Any]] = None,
        mistakes_output: Optional[str] = None) -> str:
    """
    Evaluate IdTyposAnalyzer on the identifiers from the evaluation dataset.

    :param dataset: Dataset of misspelled identifiers. The first two CSV columns are read as \
                    "wrong" and "correct".
    :param config: Configuration for the IdTyposAnalyzer. None means an empty configuration.
    :param mistakes_output: Path to the file for printing the wrong corrections. Nothing is \
                            written if it is None.
    :return: Quality report.
    """
    identifiers = pandas.read_csv(dataset,
                                  header=0,
                                  usecols=[0, 1],
                                  names=["wrong", "correct"],
                                  keep_default_na=False)
    analyzer_config = config if config is not None else {}
    analyzer = IdTyposAnalyzer(IdTyposModel(), "", analyzer_config)
    suggestions = analyzer.check_identifiers(identifiers["wrong"].tolist())

    corrections = []
    for index, identifier in enumerate(identifiers["wrong"]):
        candidates = list(
            analyzer.generate_identifier_suggestions(suggestions[index],
                                                     identifier))
        if not candidates:
            # No candidates were generated: fall back to the identifier itself.
            candidates = [Candidate(identifier, 1.0)]
        corrections.append(candidates)

    # One "sugg <rank>" column per candidate rank; missing ranks are left empty.
    n_candidates = analyzer.config["n_candidates"]
    for rank in range(n_candidates):
        column = []
        for ranked in corrections:
            column.append(ranked[rank][0] if rank < len(ranked) else "")
        identifiers["sugg " + str(rank)] = column

    if mistakes_output is not None:
        is_mistake = identifiers["sugg 0"] != identifiers["correct"]
        mistakes = identifiers[is_mistake]
        mistakes[["wrong", "sugg 0", "correct"]].to_csv(mistakes_output)

    def tokenize(x):
        """Split the given string with the parser of the analyzer."""
        return list(analyzer.parser.split(x))

    template = load_jinja2_template(
        os.path.join(TEMPLATE_DIR, "quality_on_identifiers.md.jinja2"))
    return template.render(
        identifiers=identifiers,
        suggestions=suggestions,
        vocabulary_tokens=analyzer.corrector.generator.tokens,
        n_candidates=n_candidates,
        IDENTIFIER_INDEX_COLUMN=IDENTIFIER_INDEX_COLUMN,
        Candidate=Candidate,
        Columns=Columns,
        tokenize=tokenize,
        flatten_df_by_column=flatten_df_by_column,
        generate_report=generate_report)
Ejemplo n.º 6
0
    def __init__(self, model: IdTyposModel, url: str, config: Mapping[str,
                                                                      Any]):
        """
        Create an IdTyposAnalyzer: set up the corrector, the token parser and the comment template.

        :param model: The instance of the model loaded from the repository or freshly trained.
        :param url: The analyzed project's Git remote.
        :param config: Configuration of the analyzer of unspecified structure.
        """
        super().__init__(model, url, config)
        self.config = self._load_config(config)
        self.corrector = self.corrector_manager.get(self.config["corrector"])
        self.parser = self.create_token_parser()
        template_path = self.config["comment_template"]
        self.comment_template = load_jinja2_template(template_path)
        # The tokens of every identifier stored in the model are handed to the corrector.
        for known_identifier in model.identifiers:
            tokens = set(self.parser.split(known_identifier))
            self.corrector.expand_vocabulary(tokens)
        if self.config["check_all_identifiers"]:
            self.allowed_identifiers = set()
        else:
            self.allowed_identifiers = model.identifiers
Ejemplo n.º 7
0
    def _finalize(
            self, reports: Iterable[Dict[str,
                                         str]]) -> Iterator[Dict[str, str]]:
        """
        Summarize all individual reports.

        The JSON-encoded "report" entry of every report is added to the metrics stub. Detection \
        precision and recall are derived from the summed counters, the fix accuracies are \
        divided by the number of true positive detections and the review time by the number \
        of reports.

        :param reports: Reports generated by `TypoCommitsReporter.generate_commit_dataset_report()`
        :return: Iterator over a single dict that holds the rendered summary under "report".
        """
        def format_series(series: pandas.Series,
                          formatting: Iterable[Tuple[str, str]]) -> pandas.Series:
            """
            Format the selected entries of a series as strings.

            :param series: Series to take the values from. It is not modified.
            :param formatting: Pairs of key and printf-style format without the leading "%".
            :return: Series restricted to the given keys, in the given order, holding the \
                     formatted strings.
            """
            # The copy has the object dtype so that it can hold strings whatever dtype the
            # input series has.
            formatted = series.astype(object)
            keys = []
            for key, fmt in formatting:
                keys.append(key)
                # Read the value from the argument, not from the enclosing `scores`: the
                # helper must format the series it was given.
                formatted.loc[key] = ("%" + fmt) % series[key]
            return formatted[keys]

        scores = self.get_metrics_stub()
        reports = list(reports)
        for report in reports:
            scores += pandas.Series(json.loads(report["report"]))
        scores.detection_precision = scores.detection_true_positive / (
            scores.detection_true_positive + scores.detection_false_positive)
        scores.detection_recall = scores.detection_true_positive / (
            scores.detection_true_positive + scores.detection_false_negatives)
        scores.fix_accuracy = scores.fix_accuracy / scores.detection_true_positive
        scores.top3_fix_accuracy = scores.top3_fix_accuracy / scores.detection_true_positive
        scores.review_time = scores.review_time / len(reports)
        self._log.info("final scores are\n%s", repr(scores))
        template = load_jinja2_template(self.report_template_path)
        summary = template.render(scores=scores,
                                  commit=self._get_commit(),
                                  package_version=self._get_package_version(),
                                  failures=self._failures,
                                  tabulate=tabulate,
                                  format_series=format_series)

        yield {"report": summary}