def run():
    if not git_is_available():
        msg = "contribcompl requires git to be installed and accessible on path"
        print(msg)
        sys.exit(1)

    arguments = docopt(__doc__, version=__version__)

    if arguments["-v"] or arguments["--verbose"]:
        toggle_verbose_output()

    path_to_repo = arguments["<repository>"]
    if not (is_git_url(path_to_repo) or is_git_dir(path_to_repo)):
        print(__doc__)
        sys.exit(1)

    if is_git_url(path_to_repo):
        path_to_repo = clone_to_tmp(path_to_repo)

    if arguments["commits"]:
        commit_shas = arguments["<commit_sha>"]
    elif arguments["issue"]:
        issue_re = arguments["<issue_regex>"]
        commit_shas = find_commits_for_issue(path_to_repo, issue_re)
        # print(commit_shas)

    contribcompl = compute_contrib_compl(path_to_repo, commit_shas)
    print(contribcompl)
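
# A hedged sketch of the docopt usage string that run() parses; it is not
# part of the original source. Only the "commits"/"issue" subcommands, the
# <repository>, <commit_sha>..., and <issue_regex> positionals, and separate
# -v / --verbose flags are implied by the dictionary lookups above.
USAGE_SKETCH = """contribcompl

Usage:
  contribcompl commits <repository> <commit_sha>... [-v] [--verbose]
  contribcompl issue <repository> <issue_regex> [-v] [--verbose]

Options:
  -v         Enable verbose output.
  --verbose  Enable verbose output.
"""
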
def test_compute_contrib_compl_low():
    path_to_repo = "/tmp/cassandra/"
    commit_shas = [
        "021df085074b761f2b3539355ecfc4c237a54a76",
        "2f1d6c7254342af98c2919bd74d37b9944c41a6b",
    ]

    result = compute_contrib_compl(path_to_repo, commit_shas)
    assert result == ContributionComplexity.LOW
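
# The tests compare against ContributionComplexity members, and main() below
# reads .value. A minimal sketch of such a type, assuming a plain Enum; only
# LOW and HIGH appear in the source, the middle member is a guess.
from enum import Enum

class ContributionComplexity(Enum):
    LOW = 1
    MODERATE = 2  # hypothetical middle level, not confirmed by the source
    HIGH = 3
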
def main(sys_name):

    df = pd.read_csv(f"data/{sys_name}_issues.csv")

    if sys_name == "cassandra":
        closed_col = "resolved"
        issue_key_col = "key"
    elif sys_name == "gaffer":
        closed_col = "closed_at"
        issue_key_col = "number"
    else:
        raise ValueError(f"Unknown system name: {sys_name}")

    # Keep only resolved issues: 2300 closed issues for Gaffer and
    # 14158 closed issues for Cassandra
    df = df[~df[closed_col].isnull()]
    df.reset_index(drop=True, inplace=True)

    # Uncomment to restrict to a few issues while testing:
    # df = df.iloc[:10]

    path_to_repo = f"/tmp/{sys_name}"
    contribcompls = []
    commit_shas_per_contrib = []

    for issue_key in df[issue_key_col].values:
        if sys_name == "cassandra":
            issue_re = f"{issue_key}( |$)"
        elif sys_name == "gaffer":
            issue_re = f"(Gh |gh-){issue_key}( |$)"

        commit_shas = find_commits_for_issue(path_to_repo, issue_re)
        commit_shas_per_contrib.append(commit_shas)
        print(issue_re, commit_shas, flush=True)

        if commit_shas:
            try:
                # Take .value inside the try so a failed computation stays
                # None instead of raising AttributeError on None afterwards
                contribcompl = compute_contrib_compl(path_to_repo, commit_shas).value
            except Exception:
                print(
                    f"Skipping {issue_key}",
                    issue_re,
                    commit_shas,
                    type(commit_shas),
                    flush=True,
                )
                contribcompl = None
        else:
            contribcompl = None
        contribcompls.append(contribcompl)

    df["commit_shas"] = commit_shas_per_contrib
    df["contrib_complexity"] = contribcompls

    # Prevent sha lists from being truncated
    np.set_printoptions(threshold=sys.maxsize)
    df.to_csv(f"data/{sys_name}_contrib_compl.csv", index=False)
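
# main() depends on find_commits_for_issue(path_to_repo, issue_re) returning
# the shas of commits whose messages match the issue regex. A minimal sketch
# under the assumption that it greps commit messages via git log; the real
# implementation may differ.
import subprocess

def find_commits_for_issue_sketch(path_to_repo, issue_re):
    # -E enables POSIX extended regexes, so patterns such as
    # "(Gh |gh-)42( |$)" work as written
    out = subprocess.run(
        ["git", "log", "--all", "-E", f"--grep={issue_re}", "--format=%H"],
        cwd=path_to_repo,
        capture_output=True,
        text=True,
        check=True,
    )
    return out.stdout.split()
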
def get_complete_issue_df(sys_name):
    procfname = f"data/processing/{sys_name[:3]}_issues.csv"
    if os.path.isfile(procfname):
        date_cols = ["created", "resolved", "updated"]
        df_iss = pd.read_csv(procfname, parse_dates=date_cols)
        # Parse the commit sha lists back from their string serialisation
        df_iss.commit_shas = [
            eval(h.replace("\n", ",")) if isinstance(h, str) else ""
            for h in df_iss.commit_shas
        ]
        df_iss.t_lead = pd.to_timedelta(df_iss.t_lead)
        return df_iss

    # For first time preprocessing
    fname_issues = f"data/input/{sys_name[:3]}_issues.csv"
    if sys_name == "cassandra":
        date_cols = ["created", "resolved", "updated"]
    elif sys_name == "gaffer":
        date_cols = ["created_at", "closed_at", "updated_at"]
    df_iss = pd.read_csv(fname_issues, parse_dates=date_cols)

    if sys_name == "cassandra":
        # Adjust priority values to the same format as for Gaffer
        df_iss["priority"] = df_iss.priority.str.lower()
        df_iss.rename(columns={"description": "body"}, inplace=True)
        df_iss.status = df_iss.status.str.lower()
        # Rename resolved to closed as for Gaffer; use .loc, since chained
        # assignment on a filtered copy would leave df_iss unchanged
        df_iss.loc[df_iss.status == "resolved", "status"] = "closed"
        burl = "https://issues.apache.org/jira/browse/"
        df_iss["url"] = df_iss.key.apply(lambda k: burl + k)

    if sys_name == "gaffer":
        # Rename the columns so the code below can treat both systems uniformly
        df_iss.rename(columns={"created_at": "created"}, inplace=True)
        df_iss.rename(columns={"closed_at": "resolved"}, inplace=True)
        df_iss.rename(columns={"updated_at": "updated"}, inplace=True)
        df_iss.rename(columns={"number": "key"}, inplace=True)
        # df_iss.rename(columns={"labels": "issue_type"}, inplace=True)
        df_iss.rename(columns={"state": "status"}, inplace=True)
        # clean the data
        df_iss["labels"] = df_iss.labels.apply(labels_to_list)
        df_iss["issue_type"] = df_iss.labels.apply(remove_priority)
        df_iss["priority"] = df_iss.labels.apply(extract_priority)
        # A resolution field does not exist in the GitHub issue tracker
        df_iss["resolution"] = ""

    # Compute lead time
    df_iss["t_lead"] = df_iss.resolved - df_iss.created
    df_iss["t_lead_s"] = df_iss.t_lead.dt.total_seconds()

    # Uncomment to restrict to a few issues while testing:
    # df_iss = df_iss.loc[:200]

    # Find related commits
    path_to_repo = f"{os.getenv('HOME')}/case_systems/{sys_name}"
    contribcompls = []
    commit_shas_per_contrib = []
    for issue_key in tqdm(df_iss["key"].values):
        if sys_name == "cassandra":
            issue_re = f"{issue_key}( |$)"
        elif sys_name == "gaffer":
            issue_re = f"(Gh |gh-){issue_key}( |$)"

        commit_shas = find_commits_for_issue(path_to_repo, issue_re)
        commit_shas_per_contrib.append(commit_shas)

        if commit_shas:
            try:
                # As in main(): take .value inside the try so failures stay None
                contribcompl = compute_contrib_compl(path_to_repo, commit_shas).value
            except Exception:
                print(
                    f"Skipping {issue_key}",
                    issue_re,
                    commit_shas,
                    type(commit_shas),
                    flush=True,
                )
                contribcompl = None
        else:
            contribcompl = None
        contribcompls.append(contribcompl)

    df_iss["commit_shas"] = commit_shas_per_contrib
    # Contribution complexities were computed in the loop above;
    # the full computation takes multiple hours
    df_iss["contrib_complexity"] = contribcompls

    cols = [
        "key",
        "title",
        "body",
        "created",
        "updated",
        "resolved",
        "status",
        "issue_type",
        "labels",
        "priority",
        "resolution",
        "t_lead",
        "t_lead_s",
        "url",
        "commit_shas",
        "contrib_complexity",
    ]
    np.set_printoptions(threshold=sys.maxsize)
    df_iss[cols].to_csv(procfname, index=False)

    return df_iss[cols]
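
# Example use of the loader above (not in the original): the first call runs
# the full preprocessing, including the hours-long complexity computation,
# and caches the result under data/processing/; later calls reload that CSV.
df_cas = get_complete_issue_df("cassandra")
print(df_cas[["key", "t_lead", "contrib_complexity"]].head())
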
def test_compute_contrib_compl_high():
    path_to_repo = "/tmp/cassandra/"
    commit_shas = ["a991b64811f4d6adb6c7b31c0df52288eb06cf19"]

    result = compute_contrib_compl(path_to_repo, commit_shas)
    assert result == ContributionComplexity.HIGH
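
# A quick illustration (not from the original) of how the Gaffer issue regex
# built in main() and get_complete_issue_df() behaves; the commit subjects
# below are made up.
import re

issue_re = "(Gh |gh-)42( |$)"
assert re.search(issue_re, "gh-42 Fix serialisation")      # prefix match
assert re.search(issue_re, "Resolves Gh 42")               # end-of-string match
assert not re.search(issue_re, "gh-421 unrelated change")  # no partial issue keys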