def show_line_chart_for_pr_rejection_rates_over_time(
        pulls, based_on_devstats_data=False, companies=None):
    """Plot the mean PR merge rate per company and year as a line chart.

    Despite the function name, the plotted value is the mean of the
    ``pr_is_merged`` column, i.e. an acceptance rate (the y-axis is labelled
    accordingly).

    Args:
        pulls: DataFrame of pull requests. Rows without ``created_at`` or
            ``closed_at`` are dropped before any further processing.
        based_on_devstats_data: If true, the company is determined via
            ``p.determine_company_for_issues_with_history`` and every company
            not listed in ``companies`` is relabelled ``'unknown'``.
            Otherwise the crawled kubernetes/kubernetes issue authors are
            merged in and missing companies become ``"others"``.
        companies: Company names to keep as separate lines when
            ``based_on_devstats_data`` is true. Defaults to no companies.
    """
    # Avoid a shared mutable default argument.
    if companies is None:
        companies = []

    pulls = pulls.dropna(subset=["created_at", "closed_at"])
    if based_on_devstats_data:
        pulls = p.determine_company_for_issues_with_history(pulls)
        pulls['company'] = np.where(pulls['company'].isin(companies),
                                    pulls['company'], 'unknown')
    else:
        users = c.get_issue_authors_with_company("kubernetes", "kubernetes")
        pulls = p.merge_issues_with_company_column(pulls, users)
        # Assign the result back: an inplace fillna on a column selection is
        # not guaranteed to update the frame (no-op under Copy-on-Write).
        pulls["company"] = pulls["company"].fillna("others")

    # NOTE(review): the "year" and "pr_is_merged" columns used below are
    # presumably added by these two helpers - confirm against `p`.
    pulls = p.add_dummy_column_for_rounded_year(pulls)
    pulls = p.add_dummy_column_for_pr_merge_state(pulls)

    # One column per company, one row per year.
    df = pulls.groupby(["company",
                        "year"])["pr_is_merged"].mean().unstack(level=0)
    print(df)

    # Named `ax` so the conventional `plt` pyplot alias is not shadowed.
    ax = df.plot(kind="line")
    ax.set_ylabel("acceptance rate")
    ax.set_title("PR Acceptance Rate over the Community Lifetime",
                 fontsize=10)
def compare_users_with_devstats_data():
    """Print how well the devstats user list covers the crawled issue authors.

    Reports the number of crawled kubernetes/kubernetes issue authors, the
    number of devstats users, their overlap, and how many crawled authors
    without a company value appear in the devstats list.
    """
    datastore = c.get_devstats_user()
    df = pd.DataFrame(datastore)
    # Sets give O(1) membership tests for the intersections below.
    devstats_users = set(df["login"].values)

    users_df = c.get_issue_authors_with_company("kubernetes", "kubernetes")
    users = set(users_df["user_login"].values)
    intersection = users & devstats_users

    print("# of crawled users: " + str(len(users)))
    print("# of dev stats users: " + str(len(devstats_users)))
    print("# of common users: " + str(len(intersection)))
    # Guard against an empty crawl instead of raising ZeroDivisionError.
    coverage = len(intersection) / len(users) if users else 0.0
    print("percentage of users covered by devstats: " + str(coverage))

    # Work on a filled copy of the column; an inplace fillna on a column
    # selection is not guaranteed to update the frame.
    company = users_df["company"].fillna('unknown')
    users_without_company = set(
        users_df.loc[company == "unknown", "user_login"].values)
    print("# of users without company affiliation: " +
          str(len(users_without_company)))

    user_w_company_in_devstats = users_without_company & devstats_users
    print("# of users with company affiliation in devstats: " +
          str(len(user_w_company_in_devstats)))
def add_company_column_for_issues(org, repo):
    """Merge the issue authors' company into the issues and write a TSV file.

    The output path is ``<org>_<repo>_<c.issue_file_suffix>with_employer``.

    Args:
        org: Organisation name, passed to the loaders and used in the path.
        repo: Repository name, passed to the loaders and used in the path.
    """
    authors = c.get_issue_authors_with_company(org, repo)
    crawled_issues = c.get_issues(org, repo)
    merged = merge_issues_with_company_column(crawled_issues, authors)

    target = "_".join((org, repo, c.issue_file_suffix + "with_employer"))
    merged.to_csv(target, sep='\t')
def verify_data_consistency_for_crawled_issues_and_comments_by_checking_coherent_company_representation(
        org, repo):
    """Print company frequency tables for the different crawled data sets.

    Three groups of tables are printed: company counts per issue (threshold
    1000), company counts among the issue authors (threshold 50), and company
    counts per distinct issue author in each issue data set (threshold 50).
    Every table is passed through ``_filter_by_frequency`` with the given
    threshold (presumably dropping rare companies - confirm in that helper).

    Args:
        org: Organisation name forwarded to the loaders in ``c``.
        repo: Repository name forwarded to the loaders in ``c``.
    """

    def _report(company_values, threshold):
        # Count, filter by frequency, and print the result as a DataFrame.
        counts = Counter(company_values)
        filtered = _filter_by_frequency(counts, threshold)
        print(pd.DataFrame.from_dict(filtered))

    def _company_per_author(issues):
        # Keep a single row per author login, then return the companies.
        per_author = issues[["user_login", "company"]].drop_duplicates(
            subset=["user_login"])
        return per_author.company.values

    print(
        "\nCompany representation in issues, in issues with processing time and in issues with comments and reponse time:"
    )
    _report(c.get_issues_with_company(org, repo).company.values, 1000)
    _report(c.get_issues_with_processing_time(org, repo).company.values, 1000)
    _report(c.get_issues_with_response_time(org, repo).company.values, 1000)

    print("\nContributor company affiliation baseline:")
    _report(c.get_issue_authors_with_company(org, repo).company.values, 50)

    print(
        "\nContributor company affiliation distribution in issues, in issues with processing time and in issues with comments and reponse time:"
    )
    _report(_company_per_author(c.get_issues_with_company(org, repo)), 50)
    _report(
        _company_per_author(c.get_issues_with_processing_time(org, repo)), 50)
    _report(
        _company_per_author(c.get_issues_with_response_time(org, repo)), 50)
def get_employer_for_contributors(org, repo, based_on_devstats_data=False):
    """Return a dict mapping each ``user_login`` to an employer string.

    Args:
        org: Organisation name; only used for the crawled data source.
        repo: Repository name; only used for the crawled data source.
        based_on_devstats_data: If true, read the ``affiliation`` column of
            the formatted devstats users; otherwise read the ``company``
            column of the crawled issue authors.

    Returns:
        dict: login -> employer, with missing values replaced by ``''``.
    """
    if based_on_devstats_data:
        source = get_formatted_devstats_user()
        employer_column = 'affiliation'
    else:
        source = c.get_issue_authors_with_company(org, repo)
        employer_column = 'company'

    employer_by_login = source.fillna('').set_index(
        'user_login')[employer_column]
    return employer_by_login.to_dict()
def compare_contributor_company_affiliation_with_devstats_data(org, repo):
    """Print a comparison of crawled and devstats company affiliations.

    The crawled issue authors are left-joined with the formatted devstats
    users on ``user_login``. The devstats ``company`` column is renamed to
    ``last_employer`` (in place, on the frame returned by
    ``get_formatted_devstats_user``). Printed in order:

    1. overall counts of users whose employer is one of the known companies,
       according to either source;
    2. users with no crawled company but a known devstats employer;
    3. users with a crawled company but no known devstats employer;
    4. users whose crawled company differs from a known devstats employer.

    Args:
        org: Organisation name forwarded to the loaders in ``c``.
        repo: Repository name forwarded to the loaders in ``c``.
    """
    report_columns = ["user_login", "company", "affiliation", "last_employer"]

    known_companies = list(c.get_companies(org, repo).keys())
    crawled = c.get_issue_authors_with_company(org, repo)
    devstats = get_formatted_devstats_user()
    devstats.rename(columns={'company': 'last_employer'}, inplace=True)

    merged = pd.merge(
        crawled,
        devstats[["user_login", "email", "affiliation", "last_employer"]],
        how="left",
        on=["user_login"])

    # Row masks over the merged frame, reused by the sections below.
    devstats_employer_known = merged["last_employer"].isin(known_companies)
    crawled_company_known = merged["company"].isin(known_companies)
    crawled_company_missing = merged["company"].isnull()

    # Overall identification counts.
    total_count = len(merged.index)
    crawled_count = len(merged[crawled_company_known].index)
    devstats_count = len(merged[devstats_employer_known].index)
    print(f"Overall users: {total_count}"
          f"\n-> with identified employers: {crawled_count}"
          "\n-> with devstats user affiliaton data identified employers: "
          f"{devstats_count}\n")

    # Users identified by only one of the two data sets.
    only_devstats = merged[crawled_company_missing & devstats_employer_known]
    print(only_devstats[report_columns])

    only_crawled = merged[~devstats_employer_known
                          & ~crawled_company_missing]
    print(only_crawled[report_columns])

    # Users whose crawled company contradicts the devstats employer.
    both_present = merged.dropna(subset=["company", "last_employer"])
    both_present = both_present[both_present["last_employer"].isin(
        known_companies)]
    disagreeing = both_present["company"] != both_present["last_employer"]
    print(both_present[disagreeing][report_columns])
def show_area_chart_for_pr_rejection_rates_over_time(pulls):
    """Draw one normalized stacked area chart of PR merge state per company.

    The crawled kubernetes/kubernetes issue authors are merged into ``pulls``
    to obtain a ``company`` column; missing companies become ``"others"``.
    Each company gets its own subplot row, drawn by
    ``_normalized_stacked_chart`` over ``pr_is_merged`` and ``month``.

    Args:
        pulls: DataFrame of pull requests. Rows without ``created_at`` or
            ``closed_at`` are dropped.
    """
    users = c.get_issue_authors_with_company("kubernetes", "kubernetes")
    pulls = p.merge_issues_with_company_column(pulls, users)
    # Assign the result back: an inplace fillna on a column selection is not
    # guaranteed to update the frame (no-op under Copy-on-Write).
    pulls["company"] = pulls["company"].fillna("others")
    pulls = pulls.dropna(subset=["created_at", "closed_at"])
    pulls = p.add_dummy_column_for_month(pulls)
    pulls = p.add_dummy_column_for_pr_merge_state(pulls)

    # unique() keeps first-appearance order, so the panel order is
    # deterministic (iterating a set is not).
    companies = list(pulls["company"].unique())
    if not companies:
        # Nothing to plot; subplots(nrows=0) would raise.
        return

    # squeeze=False always yields a 2-D axes array, so a single company no
    # longer produces a bare Axes object that cannot be indexed.
    _, axs = plt.subplots(nrows=len(companies), squeeze=False)
    for ax, company in zip(axs[:, 0], companies):
        print(company)
        company_pulls = pulls.loc[pulls["company"] == company]
        _normalized_stacked_chart(company_pulls, "pr_is_merged", "month",
                                  "area", ax)
def get_users(org, repo, based_on_devstats_data):
    """Return the user table from the selected data source.

    Args:
        org: Organisation name; only used for the crawled data source.
        repo: Repository name; only used for the crawled data source.
        based_on_devstats_data: If true, return the formatted devstats users;
            otherwise return the crawled issue authors with company.
    """
    if not based_on_devstats_data:
        return c.get_issue_authors_with_company(org, repo)
    return get_formatted_devstats_user()