Esempio n. 1
0
def show_line_chart_for_pr_rejection_rates_over_time(
        pulls, based_on_devstats_data=False, companies=None):
    """Plot the mean PR merge rate per company and year as a line chart.

    Rows lacking ``created_at`` or ``closed_at`` are dropped first. The
    per-company, per-year mean of ``pr_is_merged`` is printed and then
    plotted with one line per company.

    Args:
        pulls: DataFrame of pull requests with at least ``created_at`` and
            ``closed_at`` columns.
        based_on_devstats_data: When true, companies are determined through
            ``p.determine_company_for_issues_with_history`` and every company
            not listed in ``companies`` is relabelled ``'unknown'``. Otherwise
            the company comes from the crawled kubernetes/kubernetes issue
            authors.
        companies: Company names to keep when ``based_on_devstats_data`` is
            true. ``None`` (the default) is treated as an empty list.
    """
    # A None default avoids sharing one list object between calls.
    companies = [] if companies is None else companies
    pulls = pulls.dropna(subset=["created_at", "closed_at"])

    if based_on_devstats_data:
        pulls = p.determine_company_for_issues_with_history(pulls)
        pulls = pulls.assign(company=np.where(
            pulls['company'].isin(companies), pulls['company'], 'unknown'))
    else:
        users = c.get_issue_authors_with_company("kubernetes", "kubernetes")
        pulls = p.merge_issues_with_company_column(pulls, users)

    # Reassign instead of `pulls["company"].fillna(..., inplace=True)`:
    # in-place fills on a selected column are chained assignment and do not
    # update the frame when pandas Copy-on-Write is active.
    pulls = pulls.assign(company=pulls["company"].fillna("others"))
    # NOTE(review): the "year" and "pr_is_merged" columns used below are
    # presumably added by these two helpers -- confirm against `p`.
    pulls = p.add_dummy_column_for_rounded_year(pulls)
    pulls = p.add_dummy_column_for_pr_merge_state(pulls)
    df = pulls.groupby(["company",
                        "year"])["pr_is_merged"].mean().unstack(level=0)
    print(df)
    # Named `ax` rather than `plt` so the pyplot alias is not shadowed.
    ax = df.plot(kind="line")
    ax.set_ylabel("acceptance rate")
    ax.set_title("PR Acceptance Rate over the Community Lifetime",
                 fontsize=10)
Esempio n. 2
0
def compare_users_with_devstats_data():
    """Print overlap statistics between crawled users and devstats users.

    Compares the logins of the crawled kubernetes/kubernetes issue authors
    with the ``login`` values of the devstats user data and prints:

    * the number of users in each source and in both,
    * the share of crawled users that also appear in devstats,
    * how many crawled users have no company, and how many of those appear
      in the devstats data.
    """
    devstats_df = pd.DataFrame(c.get_devstats_user())
    # Sets give O(1) membership tests; the original scanned lists per user.
    devstats_users = set(devstats_df["login"].values)
    users_df = c.get_issue_authors_with_company("kubernetes", "kubernetes")
    users = set(users_df["user_login"].values)

    common_users = users & devstats_users
    # Guard the ratio so an empty crawl does not raise ZeroDivisionError.
    coverage = len(common_users) / len(users) if users else 0.0
    print("# of crawled users: " + str(len(users)))
    print("# of dev stats users: " + str(len(devstats_users)))
    print("# of common users: " + str(len(common_users)))
    print("percentage of users covered by devstats: " + str(coverage))

    # Fill into a separate Series instead of an in-place fill on the column:
    # that is chained assignment (ineffective under pandas Copy-on-Write) and
    # would also mutate the frame returned by the crawler module.
    company = users_df["company"].fillna('unknown')
    users_without_company = set(
        users_df.loc[company == "unknown", "user_login"].values)
    print("# of users without company affiliation: " +
          str(len(users_without_company)))
    # Crawled users lacking a company who are nevertheless listed in devstats.
    user_w_company_in_devstats = users_without_company & devstats_users
    print("# of users with company affiliation in devstats: " +
          str(len(user_w_company_in_devstats)))
Esempio n. 3
0
def add_company_column_for_issues(org, repo):
    """Merge the company of each issue author into the issues and save them.

    The merged frame is written as a tab-separated file whose name is
    ``<org>_<repo>_<c.issue_file_suffix>with_employer``.

    Args:
        org: Organisation name passed to the crawler accessors.
        repo: Repository name passed to the crawler accessors.
    """
    authors = c.get_issue_authors_with_company(org, repo)
    all_issues = c.get_issues(org, repo)
    merged = merge_issues_with_company_column(all_issues, authors)

    target = "_".join((org, repo, c.issue_file_suffix)) + "with_employer"
    merged.to_csv(target, sep='\t')
Esempio n. 4
0
def verify_data_consistency_for_crawled_issues_and_comments_by_checking_coherent_company_representation(
        org, repo):
    """Print company frequency tables for the crawled data sets.

    Three issue data sets are inspected (issues with company, issues with
    processing time, issues with response time) so their company
    distributions can be compared by eye. First the per-issue company counts
    are printed, then the company counts of the issue authors as a baseline,
    and finally the company counts with each ``user_login`` counted once.

    Args:
        org: Organisation name passed to the crawler accessors.
        repo: Repository name passed to the crawler accessors.
    """

    def show_company_frequencies(company_values, threshold):
        # Count the companies and print the filtered counts as a DataFrame.
        # NOTE(review): `threshold` is handed to _filter_by_frequency as-is;
        # presumably a minimum occurrence count -- confirm in that helper.
        counts = Counter(company_values)
        print(pd.DataFrame.from_dict(_filter_by_frequency(counts, threshold)))

    def one_row_per_author(issues):
        # Reduce the issues to one (user_login, company) row per author.
        author_columns = issues[["user_login", "company"]]
        return author_columns.drop_duplicates(subset=["user_login"])

    print(
        "\nCompany representation in issues, in issues with processing time and in issues with comments and reponse time:"
    )
    show_company_frequencies(
        c.get_issues_with_company(org, repo).company.values, 1000)
    show_company_frequencies(
        c.get_issues_with_processing_time(org, repo).company.values, 1000)
    show_company_frequencies(
        c.get_issues_with_response_time(org, repo).company.values, 1000)

    print("\nContributor company affiliation baseline:")
    show_company_frequencies(
        c.get_issue_authors_with_company(org, repo).company.values, 50)

    print(
        "\nContributor company affiliation distribution in issues, in issues with processing time and in issues with comments and reponse time:"
    )
    show_company_frequencies(
        one_row_per_author(
            c.get_issues_with_company(org, repo)).company.values, 50)
    show_company_frequencies(
        one_row_per_author(
            c.get_issues_with_processing_time(org, repo)).company.values, 50)
    show_company_frequencies(
        one_row_per_author(
            c.get_issues_with_response_time(org, repo)).company.values, 50)
Esempio n. 5
0
def get_employer_for_contributors(org, repo, based_on_devstats_data=False):
    """Return a dict mapping each ``user_login`` to an employer string.

    Missing values are replaced by the empty string before the mapping is
    built.

    Args:
        org: Organisation name, used only for the crawled data.
        repo: Repository name, used only for the crawled data.
        based_on_devstats_data: When true, read the ``affiliation`` column of
            the formatted devstats users; otherwise read the ``company``
            column of the crawled issue authors.

    Returns:
        dict keyed by ``user_login``.
    """
    if based_on_devstats_data:
        source = get_formatted_devstats_user()
        employer_column = 'affiliation'
    else:
        source = c.get_issue_authors_with_company(org, repo)
        employer_column = 'company'

    indexed_by_login = source.fillna('').set_index('user_login')
    return indexed_by_login[employer_column].to_dict()
Esempio n. 6
0
def compare_contributor_company_affiliation_with_devstats_data(org, repo):
    """Print how crawled company affiliations compare with devstats data.

    The crawled issue authors are left-joined on ``user_login`` with the
    formatted devstats users, whose ``company`` column is renamed to
    ``last_employer``. Then the following is printed:

    1. the overall user count and how many have a known company in either
       source,
    2. users with no crawled company but a known devstats employer,
    3. users with a crawled company but no known devstats employer,
    4. users whose crawled company differs from a known devstats employer.

    "Known" means contained in the keys of ``c.get_companies(org, repo)``.

    Args:
        org: Organisation name passed to the crawler accessors.
        repo: Repository name passed to the crawler accessors.
    """
    report_columns = ["user_login", "company", "affiliation", "last_employer"]

    known_companies = list(c.get_companies(org, repo).keys())
    crawled_users = c.get_issue_authors_with_company(org, repo)
    devstats_users = get_formatted_devstats_user()
    devstats_users.rename(columns={'company': 'last_employer'}, inplace=True)
    devstats_subset = devstats_users[[
        "user_login", "email", "affiliation", "last_employer"
    ]]
    merged = pd.merge(crawled_users,
                      devstats_subset,
                      how="left",
                      on=["user_login"])

    # Row masks reused by the reports below.
    devstats_known = merged["last_employer"].isin(known_companies)
    crawl_known = merged["company"].isin(known_companies)

    # Overall identification counts.
    total = len(merged.index)
    crawl_count = len(merged.loc[crawl_known].index)
    devstats_count = len(merged.loc[devstats_known].index)
    print(f"Overall users: {total}"
          f"\n-> with identified employers: {crawl_count}"
          "\n-> with devstats user affiliaton data identified employers: "
          f"{devstats_count}\n")

    # Users identified in only one of the two data sets.
    only_in_devstats = merged.loc[merged["company"].isnull() & devstats_known]
    print(only_in_devstats[report_columns])
    only_in_crawl = merged.loc[~devstats_known & merged["company"].notnull()]
    print(only_in_crawl[report_columns])

    # Users whose crawled company conflicts with the devstats employer.
    in_both = merged.dropna(subset=["company", "last_employer"])
    in_both = in_both.loc[in_both["last_employer"].isin(known_companies)]
    disagreeing = in_both["company"] != in_both["last_employer"]
    print(in_both.loc[disagreeing][report_columns])
Esempio n. 7
0
def show_area_chart_for_pr_rejection_rates_over_time(pulls):
    """Draw one normalized stacked area chart of PR merge state per company.

    The company of each pull request comes from the crawled
    kubernetes/kubernetes issue authors; pull requests without a company are
    grouped under ``"others"``. Rows lacking ``created_at`` or ``closed_at``
    are dropped. Each company gets its own subplot row, in order of first
    appearance in the data, and its name is printed as it is drawn.

    Args:
        pulls: DataFrame of pull requests with at least ``created_at`` and
            ``closed_at`` columns.
    """
    users = c.get_issue_authors_with_company("kubernetes", "kubernetes")
    pulls = p.merge_issues_with_company_column(pulls, users)
    # Reassign instead of an in-place fill on the selected column, which is
    # chained assignment and is ineffective under pandas Copy-on-Write.
    pulls = pulls.assign(company=pulls["company"].fillna("others"))

    pulls = pulls.dropna(subset=["created_at", "closed_at"])
    pulls = p.add_dummy_column_for_month(pulls)
    pulls = p.add_dummy_column_for_pr_merge_state(pulls)

    # unique() keeps first-appearance order, so the subplot order is
    # deterministic (iterating a set is not).
    companies = list(pulls["company"].unique())
    if not companies:
        # Nothing to draw; subplots() rejects nrows=0.
        return

    # squeeze=False always returns a 2-D array of Axes; with the default, a
    # single company yields a bare Axes object that cannot be indexed.
    _, axs = plt.subplots(nrows=len(companies), squeeze=False)
    for ax, company in zip(axs[:, 0], companies):
        print(company)
        company_pulls = pulls.loc[pulls["company"] == company]
        _normalized_stacked_chart(company_pulls, "pr_is_merged", "month",
                                  "area", ax)
Esempio n. 8
0
def get_users(org, repo, based_on_devstats_data):
    """Return the user table from devstats or from the crawled data.

    Args:
        org: Organisation name, used only for the crawled data.
        repo: Repository name, used only for the crawled data.
        based_on_devstats_data: When true, return the formatted devstats
            users; otherwise return the crawled issue authors with company.
    """
    if not based_on_devstats_data:
        return c.get_issue_authors_with_company(org, repo)
    return get_formatted_devstats_user()