Example #1
def calc_expected_vnodes_number_entry(input: str, output: str, runs: int) -> None:
    """
    Entry point for `python -m lookout.style.format calc-expected-support` command.

    :param input: csv file with repositories for the quality report. Should contain url, to \
                  and from columns.
    :param output: Path to the output csv file.
    :param runs: Number of times to repeat the calculation to verify that the result is stable.
    """
    log = logging.getLogger("expected_vnodes_number")
    handler = logging.handlers.RotatingFileHandler(output + ".errors")
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    if not server.exefile.exists():
        server.fetch()  # download executable
    port = server.find_port()
    repositories = list(csv.DictReader(handle_input_arg(input)))
    try:
        bblfsh = _restart_bblfshd(first_run=True)
        for cur_run in range(runs):
            with tempfile.TemporaryDirectory() as tmpdirname:
                database = os.path.join(tmpdirname, "db.sqlite3")
                fs = os.path.join(tmpdirname, "models")
                os.makedirs(fs, exist_ok=True)
                with AnalyzerContextManager(FormatAnalyzer, port=port, db=database, fs=fs,
                                            init=False):
                    for row in tqdm(repositories):
                        try:
                            vnodes_number = get_vnodes_number(
                                row["url"], to_commit=row["to"], from_commit=row["from"],
                                port=port, bblfsh=bblfsh)
                            log.info("%d/%d run. Expected vnodes number for %s is %d.",
                                     cur_run + 1, runs, row["url"], vnodes_number)
                            if row.get("vnodes_number", vnodes_number) != vnodes_number:
                                log.warning("vnodes number is different for %d/%d run. Get %d "
                                            "instead of %d. Set to nan.", cur_run + 1, runs,
                                            vnodes_number, row["vnodes_number"])
                                row["vnodes_number"] = float("nan")
                            else:
                                row["vnodes_number"] = vnodes_number
                        except Exception:
                            log.exception("-" * 20 + "\nFailed to process %s repo", row["url"])
                            continue
                        bblfsh = _restart_bblfshd()
    finally:
        _stop_bblfshd()

    fieldnames = ["url", "to", "from", "vnodes_number"]
    with open(output, "w") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in repositories:
            writer.writerow(row)
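A minimal usage sketch, assuming a csv file with url, to and from columns as the docstring states; both file names here are hypothetical:

# Hypothetical invocation: "repos.csv" and "vnodes.csv" are made-up paths.
# Three runs let unstable vnode counts be detected and replaced with nan.
calc_expected_vnodes_number_entry(input="repos.csv", output="vnodes.csv", runs=3)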
Example #2
def evaluate_smoke_entry(inputpath: str, reportdir: str, database: str, bblfsh: str,
                         config: dict) -> None:
    """
    CLI entry point.
    """
    start_time = time.time()
    report_filename = os.path.join(reportdir, "report.csv")
    log = logging.getLogger("evaluate_smoke")
    port = server.find_port()

    if database is None:
        db = tempfile.NamedTemporaryFile(dir=inputpath, prefix="db", suffix=".sqlite3")
        database = db.name
        log.info("Database %s created" % database)
    else:
        if os.path.exists(database):
            log.info("Found existing database %s" % database)
        else:
            log.info("Database %s not found and will be created." % database)
    with tempfile.TemporaryDirectory(dir=inputpath) as fs:
        with AnalyzerContextManager(SmokeEvalFormatAnalyzer, port=port, db=database, fs=fs):
            inputpath = Path(inputpath)
            if not server.exefile.exists():
                server.fetch()
            index_file = inputpath / "index.csv"
            os.makedirs(reportdir, exist_ok=True)
            with open(report_filename, "w") as report:
                csv.DictWriter(report, fieldnames=SmokeEvalFormatAnalyzer.REPORT_COLNAMES,
                               ).writeheader()
            with open(str(index_file)) as index:
                reader = csv.DictReader(index)
                for row in tqdm(reader):
                    repopath = inputpath / row["repo"]
                    config_json = {
                        SmokeEvalFormatAnalyzer.name:
                            merge_dicts(config, {
                                "style_name": row["style"],
                                "report_path": reportdir,
                            })}
                    server.run("review", fr=row["from"], to=row["to"], port=port,
                               git_dir=str(repopath), log_level="warning", bblfsh=bblfsh,
                               config_json=json.dumps(config_json))
            log.info("Quality report saved to %s", reportdir)

    report = pandas.read_csv(report_filename)
    with pandas.option_context("display.max_columns", 10, "display.expand_frame_repr", False):
        print(report.describe())
    log.info("Time spent: %.3f" % (time.time() - start_time))
Example #3
    @classmethod
    def setUpClass(cls):
        """Prepare environment & train the model for tests."""
        if not server.exefile.exists():
            server.fetch()
        # required config
        cls.bblfsh = "0.0.0.0:9432"
        cls.language = "javascript"

        # analyzer
        parent_loc = Path(__file__).parent.resolve()
        cls.base_dir_ = tempfile.TemporaryDirectory()
        cls.base_dir = cls.base_dir_.name
        cls.port = server.find_port()
        # extract repo
        cls.jquery_dir = os.path.join(cls.base_dir, "jquery")
        # str() is needed for Python 3.5
        with tarfile.open(str(parent_loc / "jquery.tar.xz")) as tar:
            tar.extractall(path=cls.base_dir)
        files = glob.glob(os.path.join(cls.jquery_dir, "**", "*"),
                          recursive=True)
        assert len(files) == 15, len(files)
        cls.model_path = os.path.join(str(parent_loc), "model_jquery.asdf")

    def setUp(self, fs=None):
        self.port = server.find_port()
        self.db = tempfile.NamedTemporaryFile(dir=self.base_dir)
        if fs is None:
            self.fs = tempfile.TemporaryDirectory(dir=self.base_dir)
        else:
            self.fs = fs

        self.analyzer = AnalyzerContextManager(FormatAnalyzer,
                                               port=self.port,
                                               db=self.db.name,
                                               fs=self.fs.name).__enter__()
        self.logs = logs = []

        class ShadowHandler(logging.Handler):
            def emit(self, record):
                logs.append(logging.getLogger().handlers[0].format(record))

        self.log_handler = ShadowHandler()
        logging.getLogger().addHandler(self.log_handler)

        if not os.path.exists(str(server.exefile)):
            server.fetch()
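setUp enters AnalyzerContextManager by calling __enter__ directly, so the test class needs a matching cleanup; a minimal tearDown sketch under that assumption:

    def tearDown(self):
        # Hypothetical counterpart to setUp, assuming __enter__ returned the
        # manager itself: close it and detach the shadow log handler.
        self.analyzer.__exit__(None, None, None)
        logging.getLogger().removeHandler(self.log_handler)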
Example #5
def generate_quality_report(input: str,
                            output: str,
                            force: bool,
                            bblfsh: str,
                            config: dict,
                            database: Optional[str] = None,
                            fs: Optional[str] = None) -> None:
    """
    Generate quality report for the given data. Entry point for command line interface.

    :param input: csv file with repositories for the report. Should contain url, to and from \
                  columns.
    :param output: Directory where to save results.
    :param force: If True, overwrite results stored in the output directory; \
                  if False, reuse the stored results.
    :param bblfsh: bblfsh address to use.
    :param config: config for FormatAnalyzer.
    :param database: sqlite3 database path to store the models. Temporary file is used if not set.
    :param fs: Model repository file system root. Temporary directory is used if not set.
    """
    os.makedirs(output, exist_ok=True)
    assert os.path.isdir(output), "Output should be a directory"
    log = logging.getLogger("QualityAnalyzer")
    handler = logging.handlers.RotatingFileHandler(
        os.path.join(output, "errors.txt"))
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    if not server.exefile.exists():
        server.fetch()  # download executable
    reports = []
    port = server.find_port()
    config = {
        QualityReportAnalyzer.name: merge_dicts(config, {"aggregate": True})
    }
    repositories = list(csv.DictReader(handle_input_arg(input)))
    with tempfile.TemporaryDirectory() as tmpdirname:
        database = database if database else os.path.join(
            tmpdirname, "db.sqlite3")
        fs = fs if fs else os.path.join(tmpdirname, "models")
        os.makedirs(fs, exist_ok=True)
        with AnalyzerContextManager(QualityReportAnalyzer,
                                    port=port,
                                    db=database,
                                    fs=fs,
                                    init=False):
            start_time = datetime.now()
            for ri, row in enumerate(repositories):
                now = datetime.now()
                if ri > 0:
                    left = (len(repositories) - ri) / ri * (now - start_time)
                else:
                    left = None
                log.info(
                    "\n%s\n"
                    "= %-76s =\n"
                    "= %2d / %2d%s=\n"
                    "= Now:  %-60s%s=\n"
                    "= Left: %-40s%s=\n"
                    "= Ends: %-60s%s=\n"
                    "%s",
                    "=" * 80,
                    row["url"],
                    ri + 1,
                    len(repositories),
                    " " * 70,
                    now,
                    " " * 11,
                    left,
                    " " * 31,
                    now + left if left is not None else None,
                    " " * 11,
                    "=" * 80,
                )
                report_loc = os.path.join(output, get_repo_name(row["url"]))
                train_rep_loc = report_loc + ".train_report.md"
                model_rep_loc = report_loc + ".model_report.md"
                test_rep_loc = report_loc + ".test_report.md"
                # generate or read report
                try:
                    if force or not os.path.exists(train_rep_loc) or \
                            not os.path.exists(model_rep_loc):
                        # Skip this step if report was already generated
                        vnodes_expected_number = int(row["vnodes_number"]) \
                            if "vnodes_number" in row else None
                        report = measure_quality(
                            row["url"],
                            to_commit=row["to"],
                            from_commit=row["from"],
                            port=port,
                            config=config,
                            bblfsh=bblfsh,
                            vnodes_expected_number=vnodes_expected_number)
                        if report.train_report is not None:
                            with open(train_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.train_report)
                        if report.model_report is not None:
                            with open(model_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.model_report)
                        if report.test_report is not None:
                            with open(test_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.test_report)
                    else:
                        log.info("Found existing reports for %s in %s",
                                 row["url"], output)
                        report = QualityReport()
                        with open(train_rep_loc, encoding="utf-8") as f:
                            report.train_report = f.read()
                        with open(model_rep_loc, encoding="utf-8") as f:
                            report.model_report = f.read()
                        with open(test_rep_loc, encoding="utf-8") as f:
                            report.test_report = f.read()
                    if (report.train_report is not None
                            and report.model_report is not None
                            and report.test_report is not None):
                        reports.append((row["url"], report))
                    else:
                        log.warning(
                            "skipped %s: train_report %s, model_report %s, test_report %s",
                            row["url"], report.train_report is not None,
                            report.model_report is not None,
                            report.test_report is not None)
                except Exception:
                    log.exception("-" * 20 + "\nFailed to process %s repo",
                                  row["url"])
                    continue

        for report_name in ("train_report", "test_report"):
            summary = _generate_report_summary(reports, report_name)
            log.info("\n%s\n%s", report_name, summary)
            summary_loc = os.path.join(output, "summary-%s.md" % report_name)
            with open(summary_loc, "w", encoding="utf-8") as f:
                f.write(summary)
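A minimal usage sketch, assuming the same csv layout as in Example #1; the paths are hypothetical and database/fs are left to their temporary defaults:

# Hypothetical invocation: "repos.csv" and "reports" are made-up paths.
# force=False reuses any reports already stored in the output directory.
generate_quality_report(input="repos.csv", output="reports", force=False,
                        bblfsh="0.0.0.0:9432", config={})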
Example #6
def main(args):
    """Entry point for quality report generation."""
    os.makedirs(args.output, exist_ok=True)
    assert os.path.isdir(args.output), "Output should be a directory"
    slogging.setup(args.log_level, False)
    log = logging.getLogger("QualityAnalyzer")
    handler = logging.handlers.RotatingFileHandler(
        os.path.join(args.output, "errors.txt"))
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    if not server.exefile.exists():
        server.fetch()  # download executable
    # prepare output directory
    reports = []

    port = server.find_port()
    review_config = {QualityReportAnalyzer.name: {"aggregate": True}}
    train_config = json.loads(args.train_config)

    with tempfile.TemporaryDirectory() as tmpdirname:
        database = args.database if args.database else os.path.join(
            tmpdirname, "db.sqlite3")
        fs = args.fs if args.fs else os.path.join(tmpdirname, "models")
        os.makedirs(fs, exist_ok=True)
        with AnalyzerContextManager(
                port=port,
                db=database,
                fs=fs,
                analyzer="lookout.style.format.benchmarks.general_report",
                init=False):
            start_time = datetime.now()
            for ri, repo in enumerate(REPOSITORIES):
                repo, to_commit, from_commit = repo.split()
                now = datetime.now()
                if ri > 0:
                    left = (len(REPOSITORIES) - ri) / ri * (now - start_time)
                else:
                    left = None
                log.info(
                    "\n%s\n"
                    "= %-76s =\n"
                    "= %2d / %2d%s=\n"
                    "= Now:  %-60s%s=\n"
                    "= Left: %-40s%s=\n"
                    "= Ends: %-60s%s=\n"
                    "%s",
                    "=" * 80,
                    repo,
                    ri + 1,
                    len(REPOSITORIES),
                    " " * 70,
                    now,
                    " " * 11,
                    left,
                    " " * 31,
                    now + left if left is not None else None,
                    " " * 11,
                    "=" * 80,
                )
                report_loc = os.path.join(args.output, get_repo_name(repo))
                quality_rep_loc = report_loc + ".quality_report.md"
                model_rep_loc = report_loc + ".model_report.md"
                # generate or read report
                try:
                    if args.force or not os.path.exists(quality_rep_loc) or \
                            not os.path.exists(model_rep_loc):
                        # Skip this step if report was already generated
                        report = measure_quality(repo,
                                                 to_commit=to_commit,
                                                 from_commit=from_commit,
                                                 port=port,
                                                 review_config=review_config,
                                                 train_config=train_config,
                                                 bblfsh=args.bblfsh)
                        if report.quality is not None:
                            with open(quality_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.quality)
                        if report.model is not None:
                            with open(model_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.model)
                    else:
                        report = QualityReport()
                        with open(quality_rep_loc, encoding="utf-8") as f:
                            report.quality = f.read()
                        with open(model_rep_loc, encoding="utf-8") as f:
                            report.model = f.read()
                    if report.quality is not None and report.model is not None:
                        reports.append((repo, report))
                    else:
                        log.warning("skipped %s: quality %s model %s", repo,
                                    report.quality is not None, report.model
                                    is not None)
                except Exception:
                    log.exception("-" * 20 + "\nFailed to process %s repo",
                                  repo)
                    continue

        # precision, recall, f1, support, n_rules, avg_len stats
        table = []
        fields2id = OrderedDict()
        additional_fields = ("Rules Number", "Average Rule Len")
        with io.StringIO() as output:
            for repo, report in reports:
                metrics = _get_metrics(report.quality)
                if not table:
                    table.append(("repo", ) + metrics._fields +
                                 additional_fields)
                    for i, field in enumerate(table[0]):
                        fields2id[field] = i
                n_rules, avg_len = _get_model_summary(report.model)
                table.append((get_repo_name(repo), ) + metrics +
                             (n_rules, avg_len))
            average = tuple(
                ("%" + FLOAT_PRECISION) % calc_avg(table[1:], fields2id[field])
                for field in metrics._fields)
            average += tuple(
                ("%" + FLOAT_PRECISION) % calc_avg(table[1:], fields2id[field])
                for field in additional_fields)
            fields_to_weight = (
                ("precision", "support"),
                ("recall", "support"),
                ("full_recall", "full_support"),
                ("f1", "support"),
                ("full_f1", "full_support"),
                ("ppcr", "support"),
            )
            weighted_average = []
            for field, weight_field in fields_to_weight:
                weighted_average.append(
                    ("%" + FLOAT_PRECISION) %
                    calc_weighted_avg(table[1:],
                                      col=fields2id[field],
                                      weight_col=fields2id[weight_field]))
            table.append(("Average", ) + average)
            table.append(("Weighted average", ) + tuple(weighted_average))
            float_fields = ("precision", "recall", "full_recall", "f1",
                            "full_f1", "ppcr")
            floatfmts = []
            for field in fields2id:
                if field in float_fields:
                    floatfmts.append(FLOAT_PRECISION)
                else:
                    floatfmts.append("g")

            print(tabulate(table,
                           tablefmt="pipe",
                           headers="firstrow",
                           floatfmt=floatfmts),
                  file=output)
            summary = output.getvalue()
        print(summary)
        summary_loc = os.path.join(args.output, "summary.md")
        with open(summary_loc, "w", encoding="utf-8") as f:
            f.write(summary)
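main reads output, log_level, train_config, database, fs, force and bblfsh from its args namespace; a minimal argparse sketch covering exactly those attributes (the flag names and defaults are assumptions):

import argparse

def _build_parser():  # hypothetical helper, not part of the original module
    parser = argparse.ArgumentParser(description="Generate the quality report.")
    parser.add_argument("--output", required=True, help="Directory for the reports.")
    parser.add_argument("--log-level", dest="log_level", default="INFO")
    parser.add_argument("--train-config", dest="train_config", default="{}",
                        help="JSON string with the train config.")
    parser.add_argument("--database", default=None,
                        help="sqlite3 database path; a temporary file if unset.")
    parser.add_argument("--fs", default=None,
                        help="Model repository root; a temporary directory if unset.")
    parser.add_argument("--force", action="store_true",
                        help="Regenerate reports even if they already exist.")
    parser.add_argument("--bblfsh", default="0.0.0.0:9432", help="bblfshd address.")
    return parser

main(_build_parser().parse_args())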