def get_modules_from_db(repo_ids):
    """Select the modules that belong to the given repositories.

    Each entry of ``repo_ids`` is expected in an underscore-separated form;
    its *last* underscore is turned into ``@`` before the value is matched
    against ``Repo.id``. An entry without any underscore is passed through
    unchanged.

    NOTE(review): this assumes the stored ids look like ``<name>@<revision>``
    and that the revision part never contains an underscore -- confirm
    against the code that writes ``Repo.id``.

    :param repo_ids: iterable of repository id strings using ``_`` where the
        database id uses ``@``.
    :return: the (not yet executed) select query over ``Module`` joined with
        ``Repo``, grouped by ``Repo.name`` and restricted to the given ids.
    """
    # Replacing every underscore would also rewrite underscores that are part
    # of the repository name itself, so only the final one is converted.
    db_ids = ["@".join(r.rsplit("_", 1)) for r in repo_ids]

    # `<<` is peewee's IN operator.
    return (
        Module.select()
        .join(Repo)
        .group_by(Repo.name)
        .where(Repo.id << db_ids)
    )
def draw_count_test_cases():
    """Plot how many repositories fall into each test-suite-size bin.

    Loads ``Module`` rows joined with ``Repo`` (grouped by ``Repo.name``),
    bins their ``total_cases`` value and saves a count plot to
    ``good_repos_by_suite_size_bin.png`` in the current working directory.

    Reads the module-level ``transparent_bg`` flag (defined outside this
    function) to decide whether the saved image has a transparent background.
    Rows whose ``total_cases`` lies outside ``[1, 2000]`` get no bin and are
    therefore not counted.
    """
    query = (
        Module.select()
        .join(Repo)
        .group_by(Repo.name)
        .order_by(Module.total_cases.desc())
    )
    df = pd.DataFrame(list(query.dicts()))

    _, ax = plt.subplots(figsize=(10, 5))

    bin_column_name = "bin_tc"
    column_to_bin = "total_cases"
    bins = [1, 2, 6, 10, 20, 50, 100, 2000]
    bin_labels = [f"[{lo}-{hi}]" for lo, hi in zip(bins[:-1], bins[1:])]

    # pd.cut builds right-closed intervals by default, which would make the
    # first bin (1, 2] and silently drop suites with exactly one test case.
    # include_lowest=True closes the first interval on the left so that the
    # "[1-2]" label matches what is actually counted.
    df[bin_column_name] = pd.cut(
        df[column_to_bin],
        bins=bins,
        labels=bin_labels,
        include_lowest=True,
    )

    # Draw on the axes created above instead of relying on the implicit
    # "current axes" lookup.
    sns.countplot(
        data=df,
        x=bin_column_name,
        color=sns.color_palette()[0],
        ax=ax,
    )
    ax.set(xlabel="Suite size", ylabel="Repositories count")

    plt.tight_layout()
    plt.savefig("good_repos_by_suite_size_bin.png",
                dpi=300,
                transparent=transparent_bg)
def get_modules_from_db(repo_ids):
    """Select the modules that belong to the given repositories.

    Every incoming id is rebuilt as "everything up to 41 characters from the
    end" + ``@`` + "the last 40 characters", i.e. the character sitting just
    before the 40-character tail is replaced by ``@``. The rebuilt ids are
    printed and then matched against ``Repo.id``.

    NOTE(review): presumably the 40-character tail is a revision hash --
    confirm against the code that writes ``Repo.id``.

    :param repo_ids: iterable of repository id strings.
    :return: the select query over ``Module`` joined with ``Repo`` and
        restricted to the rebuilt ids.
    """
    def restore_id(raw):
        # Drop the single separator character in front of the 40-char tail
        # and put "@" in its place.
        return raw[:-41] + "@" + raw[-40:]

    repo_ids = list(map(restore_id, repo_ids))
    print(repo_ids)

    # `<<` is peewee's IN operator.
    joined = Module.select().join(Repo)
    return joined.where(Repo.id << repo_ids)
def draw_coverage_goals():
    """Draw two box plots of coverage-goal counts and save them as PNG files.

    Loads ``Module`` rows joined with ``Repo`` (grouped by ``Repo.name``) and
    produces:

    * ``coverage_goals_all_unique.png`` -- statements, branches and the sum
      of all pair columns;
    * ``pairs_detailed_all_unique.png`` -- the three pair columns on their
      own.

    Column names are shown through ``HUMAN_READABLE_LABELS``. The
    module-level names ``for_presentation``, ``make_box_plot_white`` and
    ``transparent_bg`` are defined outside this function and control the
    presentation styling and the background of the saved images.
    """
    query = (
        Module.select()
        .join(Repo)
        .group_by(Repo.name)
        .order_by(Module.total_cases.desc())
    )
    df = pd.DataFrame(list(query.dicts()))

    pair_columns = ["m_pairs", "im_pairs", "ic_pairs"]
    df["all_pairs"] = df[pair_columns].sum(axis=1)

    # Copy every labelled column under its readable name; the original
    # column is blanked (set to None), not dropped.
    for raw_name, label in HUMAN_READABLE_LABELS.items():
        df[label] = df[raw_name]
        df[raw_name] = None

    # (raw column names, x-axis label, y-axis label, output file)
    figures = (
        (
            ["statements", "branches", "all_pairs"],
            "Goal types",
            "Goals count",
            "coverage_goals_all_unique.png",
        ),
        (
            pair_columns,
            "Definition-use pair types",
            "Pairs count",
            "pairs_detailed_all_unique.png",
        ),
    )

    for raw_columns, x_label, y_label, file_name in figures:
        # Labels are resolved right before each plot, one figure at a time.
        readable = [HUMAN_READABLE_LABELS[name] for name in raw_columns]

        _, ax = plt.subplots(figsize=(10, 5))
        sns.boxplot(data=df.loc[:, readable])
        if for_presentation:
            make_box_plot_white(ax)

        ax.set(xlabel=x_label, ylabel=y_label)
        plt.tight_layout()
        plt.savefig(file_name,
                    dpi=300,
                    transparent=transparent_bg)
        pragmas={
            "journal_mode": "wal",
            "cache_size": -64 * 1000,
            "foreign_key": 1,
            "ignore_check_constraints": 9,
            "synchronous": 0,
        }
    )

    DATABASE_PROXY.initialize(db)
    ms: List[Module] = Module.select() \
        .where(Module.st_cov > 70) \
        .where(Module.du_cov > 70) \
        .where(Module.br_cov > 70) \
        .where(Module.bugs < 8) \
        .where(Module.statements > 50) \
        .where(Module.total_cases > 20) \
        .where(Module.total_cases < 1000) \
        .where(Module.is_full_cfg == 1) \
        .group_by(Module.path) \
        .order_by(Module.total_cases.desc()) \
        # .limit(10)
    print(len(list(ms)))
    # exit()
    print("\n".join(list(
        map(lambda t: " ".join(t),
            zip(
                map(str, map(operator.attrgetter("path"), ms)),
                map(str, map(operator.attrgetter("path"), ms)),
                map(str, map(operator.attrgetter("total_cases"), ms)),
                map(str, map(operator.attrgetter("statements"), ms))
    database = "../selection.db"

    db = SqliteExtDatabase(database,
                           pragmas={
                               "journal_mode": "wal",
                               "cache_size": -64 * 1000,
                               "foreign_key": 1,
                               "ignore_check_constraints": 9,
                               "synchronous": 0,
                           })
    DATABASE_PROXY.initialize(db)
    # .where(Module.total_cases < 100) \
    #     .where(Module.bugs < 8) \
    ms: List[Module] = Module.select() \
        .join(Repo) \
        .group_by(Repo.name) \
        .order_by(Module.total_cases.desc()) \
        .offset(off) \
        .limit(lim)
    ms = list(ms)
    repos = []

    for m in ms:
        repo_id = m.repo.id
        repo_id = repo_id.replace("@", "_")
        df = read_combined_df(data_path, repo_id, DataFrameType.FIXED_SIZE)
        if df is not None:
            # print(df)
            metrics = df[METRIC].unique()
            if len(metrics) < 3:
                continue
            by_metric = df.groupby(METRIC)[SUITE_COVERAGE].agg(["min", "max"])