def speed_ccp_cochange_by_var(commits_per_user_file, fixed_variable, fixed_values):
    """Run the quality/speed co-change analysis, split by one fixed variable.

    Note that repo_file_quality_per_year uses bug hit ratio and not ccp.
    For change analysis it doesn't matter.
    :param commits_per_user_file: CSV with per-repo, per-year commit/user stats.
    :param fixed_variable: column name to hold fixed (also used as a control).
    :param fixed_values: the values of fixed_variable to analyze separately.
    :return: None; results are printed by cochange_analysis_by_value.
    """
    key = 'repo_name'
    control_variables = [fixed_variable]
    trep = get_valid_repos()
    users_per_project = pd.read_csv(commits_per_user_file)
    # Keep only the years in the analyzed window.
    users_per_project = users_per_project[
        users_per_project.year > EARLIEST_ANALYZED_YEAR]
    per_year_df = pd.merge(users_per_project, trep, on='repo_name')
    cochange_analysis_by_value(per_year_df,
                               first_metric='corrective_commits_ratio',
                               second_metric='commits_per_above11_users',
                               first_the_higher_the_better=False,
                               second_the_higher_the_better=True,
                               first_sig_threshold=0.1,
                               second_sig_threshold=10,
                               fixed_variable=fixed_variable,
                               fixed_values=fixed_values,
                               key=key,
                               control_variables=control_variables)
def speed_ccp_cochange(commits_per_user_file):
    """Run the quality (CCP proxy) vs. speed co-change analysis over all repos.

    Note that repo_file_quality_per_year uses bug hit ratio and not ccp.
    For change analysis it doesn't matter.
    :param commits_per_user_file: CSV with per-repo, per-year commit/user stats.
    :return: None; results are printed by cochange_analysis.
    """
    key = 'repo_name'
    trep = get_valid_repos()
    trep = trep[['repo_name']]
    users_per_project = pd.read_csv(commits_per_user_file)
    # Consistency fix: use the shared EARLIEST_ANALYZED_YEAR constant instead
    # of the magic number 2014, matching the identical filter applied to this
    # same dataset in speed_ccp_cochange_by_var and build_repo_per_year_df.
    users_per_project = users_per_project[
        users_per_project.year > EARLIEST_ANALYZED_YEAR]
    per_year_df = pd.merge(users_per_project, trep, on='repo_name')
    per_year_df = per_year_df[[
        'repo_name', 'year', 'corrective_commits_ratio',
        'commits_per_above11_users'
    ]]
    # Drop repo/years missing either metric so the analysis sees full pairs.
    per_year_df = per_year_df.dropna()
    cochange_analysis(per_year_df,
                      first_metric='corrective_commits_ratio',
                      second_metric='commits_per_above11_users',
                      first_the_higher_the_better=False,
                      second_the_higher_the_better=True,
                      first_sig_threshold=0.1,
                      second_sig_threshold=10,
                      key=key)
def build_repo_per_year_df(cochange_file, key, control_variables=None):
    """Join a per-year co-change CSV with the valid-repo properties.

    :param cochange_file: CSV with per-repo, per-year metrics.
    :param key: join column (e.g. 'repo_name').
    :param control_variables: extra repo columns to carry along; defaults to
        none. (Was a mutable default argument ``[]``; ``None`` avoids the
        shared-list pitfall and is backward compatible.)
    :return: merged DataFrame restricted to years after EARLIEST_ANALYZED_YEAR.
    """
    if control_variables is None:
        control_variables = []
    trep = get_valid_repos()
    trep = trep[[key] + control_variables]
    cochange_df = pd.read_csv(cochange_file)
    cochange_df = cochange_df[cochange_df.year > EARLIEST_ANALYZED_YEAR]
    df = pd.merge(cochange_df, trep, on=key)
    return df
def build_porting_pairs():
    """Write candidate porting pairs to porting_pairs.csv.

    A candidate is a user owning more than one project whose name mentions a
    known language (any casing); such users may have ported a project between
    languages.
    """
    repos = get_valid_repos()
    repos['user'] = repos.repo_name.map(lambda full: full.split('/')[0])
    repos['project'] = repos.repo_name.map(lambda full: full.split('/')[1])
    # Match language names as-is and lower-cased, escaped for regex safety.
    names = lang_name + [n.lower() for n in lang_name]
    pattern = '|'.join(re.escape(n) for n in names)
    named = repos[repos.project.str.contains(pattern)]
    per_user = named.groupby(['user'], as_index=False).agg(
        {'repo_name': 'count'})
    multi_repo_users = per_user[per_user.repo_name > 1].user.tolist()
    candidates = named[named.user.isin(multi_repo_users)].sort_values(
        'user')[['repo_name', 'user', 'project', 'language']]
    candidates.to_csv(os.path.join(DATA_PATH, 'porting_pairs.csv'),
                      index=False)
def plot_ccp_pdf():
    """Plot the CDF of 2019 CCP, split by developer-count group and age group.

    Fix: write the figures under FIGURES_PATH like every other plotting
    helper in this file, instead of the hard-coded, Windows-only 'c:/tmp'
    paths the originals used.
    """
    df = get_valid_repos()
    plot_cdf_by_column(df,
                       column_name='y2019_ccp',
                       title='CDF of CCP',
                       output_file=os.path.join(
                           FIGURES_PATH, 'ccp_by_dev_num_group_cdf.png'),
                       subsets_column='dev_num_group')
    plot_cdf_by_column(df,
                       column_name='y2019_ccp',
                       title='CDF of CCP',
                       output_file=os.path.join(
                           FIGURES_PATH, 'ccp_by_age_group_cdf.png'),
                       subsets_column='age_group')
def coupling_analysis(coupling_file):
    """Relate file coupling (avg. files per commit) to project quality.

    Splits projects into coupling quartile groups, prints the probability of
    being in the 'Top 10' quality group per group, then the joint effect of
    low coupling and small files.
    :param coupling_file: CSV with per-repo, per-year avg_capped_files.
    :return: merged per-repo DataFrame with the added 'coupling_group' column.
    """
    trep = get_valid_repos()
    coupling_size = pd.read_csv(coupling_file)
    # Restrict to the analyzed year only.
    coupling_size = coupling_size[coupling_size.year == ANALYZED_YEAR]
    treps = pd.merge(trep, coupling_size, on='repo_name')
    print(treps.avg_capped_files.describe())
    coupling_25_q = treps.avg_capped_files.quantile(0.25)
    print("coupling 25 quantile", coupling_25_q)
    coupling_75_q = treps.avg_capped_files.quantile(0.75)
    print("coupling 75 quantile", coupling_75_q)
    # Quartile-based groups; the middle half is lumped together.
    treps['coupling_group'] = treps.apply(
        lambda x: 'Lower 25' if x.avg_capped_files < coupling_25_q
        else "top 25" if x.avg_capped_files > coupling_75_q
        else "Middle",
        axis=1)
    print('top 10 prob',
          1.0 * len(treps[treps.quality_group == 'Top 10']) / len(treps))
    top_10_in_l25 = 1.0 * len(
        treps[(treps.quality_group == 'Top 10')
              & (treps.coupling_group == 'Lower 25')]) / len(
                  treps[treps.coupling_group == 'Lower 25'])
    print('top 10 prob in lower 25', top_10_in_l25)
    top_10_in_t25 = 1.0 * len(
        treps[(treps.quality_group == 'Top 10')
              & (treps.coupling_group == 'top 25')]) / len(
                  treps[treps.coupling_group == 'top 25'])
    print('top 10 prob in top 25', top_10_in_t25)
    print("short files lift ", top_10_in_l25 / top_10_in_t25 - 1)
    # NOTE(review): 8.156 is presumably a coupling decile boundary used to
    # select the 4 worst deciles -- confirm against the data.
    print("CCP in 4 top deciles",
          round(treps[treps.avg_capped_files > 8.156].y2019_ccp.mean(), 2))
    group_by_size = treps.groupby(['coupling_group'],
                                  as_index=False).agg({'y2019_ccp': 'mean'})
    print(group_by_size)
    # Joint analysis with file size: projects low in both coupling and size.
    size_df = run_file_size_analysis()
    joint = pd.merge(treps, size_df, on='repo_name')
    both_l25 = len(joint[(joint.coupling_group == 'Lower 25')
                         & (joint.size_group == 'Lower 25')])
    top_10_in_both_l25 = 1.0 * len(
        joint[(joint.quality_group_x == 'Top 10')
              & (joint.coupling_group == 'Lower 25')
              & (joint.size_group == 'Lower 25')]) / both_l25
    print('top 10 prob in lower 25 in coupling and size', top_10_in_both_l25)
    print('both lower 25', both_l25, "ratio", both_l25 / len(joint))
    print("both lower 25 CCP",
          joint[(joint.coupling_group == 'Lower 25')
                & (joint.size_group == 'Lower 25')].y2019_ccp_x.mean())
    return treps
def plot_dev_num():
    """Box-plot 2019 CCP against developer-count buckets.

    Buckets: single author, author-count deciles, the 99th percentile, and an
    open-ended top bucket.
    """
    repos = get_valid_repos()
    # Build the bucket edges step by step rather than in one expression.
    edges = [0, 1]
    for decile in range(1, 10):
        edges.append(int(repos.authors.quantile(decile * 0.1)))
    edges.append(int(repos.authors.quantile(0.99)))
    edges.append(float("inf"))
    repos['dev_num_sets'] = pd.cut(repos.authors, edges)
    repos = repos.sort_values('dev_num_sets')
    plot_deciles(repos,
                 grouping_column='dev_num_sets',
                 metric_column='y2019_ccp',
                 title="Number of Developers vs. CCP",
                 xaxis_title="Developers (single, deciles and 99%)",
                 output_file=os.path.join(FIGURES_PATH,
                                          'ccp_by_dev_num_boxplot.png'))
def Linus_rule():
    """Check whether highly-starred projects of prominent users have better CCP.

    Per selected user, prints CCP of their many-stars projects vs. the rest;
    then compares all selected users' projects against everyone else's.
    """
    df = get_valid_repos()
    df = check_name_redundency(df)
    selected_users = [
        'google', 'facebook', 'apache', 'angular', 'kubernetes', 'tensorflow'
    ]
    # Projects in the top 5% by stars count as 'many stars'.
    many_stars_threshhold = df.stargazers_count.quantile(0.95)
    print("many_stars_threshold 95%", many_stars_threshhold)
    df['many_stars'] = df.stargazers_count.map(
        lambda x: x > many_stars_threshhold)
    for i in selected_users:
        print(i)
        g = df[df.user == i].groupby(['many_stars'], as_index=False).agg({
            'y2019_ccp': 'mean',
            'repo_name': 'count'
        })
        print(g)
        # Relative CCP of the user's many-stars projects vs. their others.
        # NOTE(review): assumes the user has projects in both star groups;
        # otherwise iloc[0] raises -- confirm for the selected users.
        print(
            "Many stars lift",
            round(
                g[g.many_stars].iloc[0].y2019_ccp /
                g[~g.many_stars].iloc[0].y2019_ccp - 1.0, 2))
    df['selected_users_project'] = df.user.map(lambda x: x in selected_users)
    g = df.groupby(['selected_users_project'], as_index=False).agg({
        'y2019_ccp': 'mean',
        'repo_name': 'count',
        'age': 'mean',
        'authors': 'mean',
        'stargazers_count': 'mean'
    })
    print(g)
    # Selected users vs. the rest, metric by metric, with the relative lift.
    for i in ['y2019_ccp', 'age', 'authors', 'stargazers_count']:
        print(
            str(i) + " users",
            g[g.selected_users_project][i].iloc[0], "others",
            g[~g.selected_users_project][i].iloc[0], "lift",
            g[g.selected_users_project][i].iloc[0] /
            g[~g.selected_users_project][i].iloc[0] - 1)
def onboarding_analysis(onboarding_file):
    """Compare newcomer onboarding between the 'Top 10' quality group and others.

    :param onboarding_file: CSV with per-repo, per-year churn/onboarding stats
        (includes the comming_involved_developers_ratio column).
    :return: the merged per-repo DataFrame.
    """
    df = get_valid_repos()
    churn = pd.read_csv(onboarding_file)
    # Restrict to the analyzed year only.
    churn = churn[churn.year == ANALYZED_YEAR]
    df = pd.merge(df, churn, on='repo_name')
    g = df.groupby('quality_group', as_index=False).agg(
        {'comming_involved_developers_ratio': 'mean'})
    print("Onboarding by quality group")
    print(g)
    # Relative onboarding advantage of the Top 10 group.
    print(
        "Lift",
        round(
            g[g.quality_group == 'Top 10'].iloc[0].
            comming_involved_developers_ratio / g[g.quality_group == 'Others'].
            iloc[0].comming_involved_developers_ratio - 1.0, 2))
    return df
def analyze_porting_pairs():
    """Compute CCP ratios between ported project pairs of the same user.

    Reads the manually-curated porting_pairs.csv, attaches each project's
    CCP, pairs projects of the same user, and writes per-language-pair stats.
    """
    # After manual editing and selecting only suitable pairs
    df = get_valid_repos()
    df['user'] = df.repo_name.map(lambda x: x.split('/')[0])
    df['project'] = df.repo_name.map(lambda x: x.split('/')[1])
    p = pd.read_csv(os.path.join(DATA_PATH, 'porting_pairs.csv'))
    j = pd.merge(p, df[['repo_name', 'y2019_ccp']], on='repo_name')
    # Self-join on user builds all ordered project pairs; drop self-pairs.
    pairs = pd.merge(j, j, on='user')
    pairs = pairs[(pairs.project_x != pairs.project_y)]
    pairs['y_ccp_by_x'] = 1.0 * pairs.y2019_ccp_y / pairs.y2019_ccp_x
    # NOTE(review): {'mean', 'std'} is a set, so the resulting column order is
    # not guaranteed -- confirm downstream consumers don't depend on it.
    g = pairs.groupby(['language_x', 'language_y']).agg({
        'project_x': 'count',
        'y_ccp_by_x': {'mean', 'std'}
    })
    print(g)
    pairs.to_csv(os.path.join(DATA_PATH, 'porting_pairs_ccp.csv'), index=False)
def quality_and_speed(commits_per_user_file):
    """Compare development-speed metrics between quality groups.

    Fix: the aggregation dict contained the corrupted placeholder '******'
    for two columns, which is not a valid pandas reducer and would raise at
    runtime; restored to 'mean', matching the sibling speed columns.
    :param commits_per_user_file: CSV with per-repo, per-year commit/user stats.
    :return: the per-quality-group aggregate DataFrame.
    """
    trep = get_valid_repos()
    trep = trep[['repo_name', 'quality_group']]
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = pd.merge(users_per_project, trep, on='repo_name')
    users_per_project_cur = users_per_project[users_per_project.year ==
                                              ANALYZED_YEAR].copy()
    # commit_per_user
    users_per_project_cur['commit_per_user'] = (
        users_per_project_cur.commits / users_per_project_cur.users)
    # users_capped_commit_per_user
    users_per_project_cur['users_capped_commit_per_user'] = (
        users_per_project_cur.users_capped_commit /
        users_per_project_cur.users)
    g = users_per_project_cur.groupby(['quality_group'], as_index=False).agg({
        'repo_name': 'count',
        'commit_per_user': 'mean',
        'users_above_11_commits_per_above11_users': 'mean',
        'users_capped_commit_per_user': 'mean',
        'users_above_11_500_cap_per_above11_users': 'mean'
    })
    print("quality and speed")
    print(g)
    print("Commit per user top 10 lift",
          (g[g.quality_group == 'Top 10'].iloc[0].commit_per_user /
           g[g.quality_group == 'Others'].iloc[0].commit_per_user) - 1)
    print("Capped commit per user above 11 top 10 lift",
          (g[g.quality_group == 'Top 10'].iloc[0]
           .users_above_11_500_cap_per_above11_users /
           g[g.quality_group == 'Others'].iloc[0]
           .users_above_11_500_cap_per_above11_users) - 1)
    return g
def compute_lang_anova(major_extensions_file):
    """One-way ANOVA of 2019 CCP across dominant-language project groups.

    :param major_extensions_file: CSV with each repo's dominant extension.
    """
    ext = pd.read_csv(major_extensions_file)
    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]
    trep = get_valid_repos()
    major = pd.merge(trep, dominant, on='repo_name')
    print("projects with a ", DOMINANT_RATE, " dominant extension",
          len(major[major.major_extension_ratio > DOMINANT_RATE]))
    # One CCP sample per dominant extension, fed to the ANOVA together.
    extensions = ['.cpp', '.cs', '.java', '.js', '.php', '.py', '.sh']
    samples = [
        major[major.major_extension == extension].y2019_ccp.tolist()
        for extension in extensions
    ]
    print(stats.f_oneway(*samples))
def describe_repos(repos_file, bq_propeties_file, git_propeties_file):
    """Print the funnel of repository counts at each filtering stage.

    Stages: active BigQuery repos (at least 200 commits in 2019) -> non-fork
    -> non-redundant -> valid (per get_valid_repos).
    Fix: corrected the 'reduendent' typo in the printed labels.
    :param repos_file: CSV of non-redundant repositories.
    :param bq_propeties_file: CSV of BigQuery-derived repository properties.
    :param git_propeties_file: CSV of git-derived repository properties.
    """
    print("################## Describing repositories ##################")
    df = pd.read_csv(bq_propeties_file)
    # Activity threshold: more than 199 (i.e. at least 200) commits in 2019.
    df = df[df.commit2019 > 199]
    print("Large active repositories", '{:,}'.format(df.repo_name.nunique()))
    git_repos = pd.read_csv(git_propeties_file)
    git_repos = pd.merge(git_repos, df, on='repo_name')
    print("BQ Large non fork repositories",
          '{:,}'.format(git_repos[~git_repos.fork].repo_name.nunique()))
    repos = pd.read_csv(repos_file)
    print("Large non redundant repositories",
          '{:,}'.format(repos[~repos.fork].repo_name.nunique()))
    trep = get_valid_repos()
    print("Valid non redundant repositories",
          '{:,}'.format(trep.repo_name.nunique()))
def run_generate_bins():
    """Write the stars/authors/start-year vs. CCP bin summaries to CSV files."""
    repos = get_valid_repos()
    # Same binned pair analysis against CCP for each column of interest.
    targets = [
        ('stargazers_count', 'stars_by_ccp_bins.csv'),
        ('authors', 'authors_by_ccp_bins.csv'),
        ('start_year', 'start_year_by_ccp_bins.csv'),
    ]
    for column, file_name in targets:
        pair_analysis_by_bins_to_file(repos,
                                      'y2019_ccp',
                                      column,
                                      output_file=os.path.join(
                                          DATA_PATH, file_name),
                                      bins=10)
def developer_num_analysis():
    """Relate the number of authors to 2019 CCP.

    Prints the correlation, CCP for small author counts, and CCP summaries
    per author-count quartile plus the top 1%.
    Fix: the group filtered with ``authors > q75`` was mislabeled
    "above q50"; the label now says "above q75".
    """
    trep = get_valid_repos()
    print("Authors & ccp correlation", trep.corr()['authors']['y2019_ccp'])
    print("CCP for the first num of developers")
    print(trep.groupby(['authors'], as_index=False).agg({
        'repo_name': 'count',
        'y2019_ccp': 'mean'
    })[:20])
    print(trep.authors.describe())
    q25 = trep.authors.quantile(0.25)
    print("q25", q25)
    print(trep[(trep.authors < q25)].agg({
        'repo_name': 'count',
        'y2019_ccp': 'mean'
    }))
    q75 = trep.authors.quantile(0.75)
    print("q75", q75)
    # Middle half (strictly between the quartile boundaries).
    print(trep[(trep.authors > q25) & (trep.authors < q75)].agg({
        'repo_name': 'count',
        'y2019_ccp': 'mean'
    }))
    print("above q75")
    print(trep[(trep.authors > q75)].agg({
        'repo_name': 'count',
        'y2019_ccp': 'mean'
    }))
    q99 = trep.authors.quantile(0.99)
    print("q99", q99)
    print(trep[(trep.authors > q99)].agg({
        'repo_name': 'count',
        'y2019_ccp': 'mean'
    }))
def coupling_ccp_cochange(repo_file_quality_per_year, repo_file_coupling_per_year):
    """Co-change analysis of quality vs. file coupling, per repo and year.

    Note that repo_file_quality_per_year uses bug hit ratio and not ccp.
    For change analysis it doesn't matter.
    :param repo_file_quality_per_year: per-repo, per-year quality CSV.
    :param repo_file_coupling_per_year: per-repo, per-year coupling CSV.
    :return: None; results are printed by the cochange helpers.
    """
    key = 'repo_name'
    repo_file_quality_per_year_df = build_repo_per_year_df(
        repo_file_quality_per_year, key=key)
    repo_file_coupling_per_year_df = build_repo_per_year_df(
        repo_file_coupling_per_year, key=key)
    # Align quality and coupling observations on repo and year.
    per_year_df = pd.merge(repo_file_quality_per_year_df,
                           repo_file_coupling_per_year_df,
                           on=[key, 'year'])
    repos = get_valid_repos()
    per_year_df = pd.merge(per_year_df, repos, on=[key])
    cochange_analysis(per_year_df,
                      first_metric='corrective_commits_ratio',
                      second_metric='avg_capped_files',
                      first_the_higher_the_better=False,
                      second_the_higher_the_better=False,
                      first_sig_threshold=0.1,
                      second_sig_threshold=1,
                      key=key)
    # Repeat with control variables to reduce confounding.
    cochange_with_control(per_year_df,
                          first_metric='corrective_commits_ratio',
                          second_metric='avg_capped_files',
                          first_the_higher_the_better=False,
                          second_the_higher_the_better=False,
                          first_sig_threshold=0.1,
                          second_sig_threshold=1,
                          key=key)
def plot_longevity(repo_properties_file, longevity_file):
    """Plot project longevity (survival past the end of 2019) by CCP decile.

    Longevity is on 2018 projects, which are in a different file and
    therefore get a different function.
    :param repo_properties_file: CSV of 2018 repository properties.
    :param longevity_file: CSV with days_from_2019_end per repository.
    """
    repos = pd.read_csv(repo_properties_file)
    longevity = pd.read_csv(longevity_file)
    df = pd.merge(repos, longevity, on='repo_name', how='left')
    # Valid 2018 projects only: non-forks with CCP in the open (0, 1) range.
    df = df[(df.fork == False) & (df.y2018_ccp > 0) & (df.y2018_ccp < 1)]
    # 1 if the project was still active after the end of 2019, else 0.
    df['after_2019_end'] = df.days_from_2019_end.map(lambda x: 1
                                                     if x > 0 else 0)
    grouping_column = 'y2018_ccp_10bins'
    # Bin the 2018 CCP using the 2019 population's decile boundaries so both
    # years are on a comparable scale.
    repos_2019 = get_valid_repos()
    bins = 10
    cuts = [0.0] + [
        repos_2019['y2019_ccp'].quantile((1.0 / bins) * i)
        for i in range(1, bins)
    ] + [1.0]
    df[grouping_column] = pd.cut(df['y2018_ccp'], cuts)
    # Dead code kept for reference: earlier binning based on own quantiles.
    """
    bin_metric_by_quantiles(df , 'y2018_ccp' , grouping_column , bins=10 )
    """
    df = df.sort_values(grouping_column)
    plot_deciles(df=df,
                 grouping_column=grouping_column,
                 metric_column='after_2019_end',
                 title='Longevity by CCP',
                 xaxis_title='CCP deciles',
                 output_file=os.path.join(FIGURES_PATH, 'longevity.png'))
def speed_consistency(commits_per_user_file):
    """Measure year-over-year consistency of development-speed metrics.

    Joins each repo's ANALYZED_YEAR stats with the previous year's and prints
    the Pearson correlation of every speed metric across the two years.
    :param commits_per_user_file: CSV with per-repo, per-year commit/user stats.
    :return: the merged current/previous-year DataFrame.
    """
    trep = get_valid_repos()
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.repo_name.isin(
        trep.repo_name.unique())]
    # Current-year snapshot; columns renamed with a 'cur_' prefix so they
    # survive the merge with the previous year's snapshot.
    users_per_project_cur = users_per_project[users_per_project.year ==
                                              ANALYZED_YEAR].copy()
    users_per_project_cur = users_per_project_cur.rename(
        columns={
            'users': 'cur_users',
            'commits': 'cur_commits',
            'users_above_11': 'cur_users_above_11',
            'users_above_11_commits_per_above11_users':
            'cur_users_above_11_commits_per_above11_users',
            'users_capped_commit': 'cur_users_capped_commit',
            'users_above_11_500_cap_per_above11_users':
            'cur_users_above_11_500_cap_per_above11_users'
        })
    # commit_per_user
    users_per_project_cur[
        'cur_commit_per_user'] = users_per_project_cur.cur_commits / users_per_project_cur.cur_users
    # users_capped_commit_per_user
    users_per_project_cur[
        'cur_users_capped_commit_per_user'] = users_per_project_cur.cur_users_capped_commit / users_per_project_cur.cur_users
    # Previous-year snapshot, 'prev_' prefixed the same way.
    users_per_project_prev = users_per_project[users_per_project.year == (
        ANALYZED_YEAR - 1)].copy()
    users_per_project_prev = users_per_project_prev.rename(
        columns={
            'users': 'prev_users',
            'commits': 'prev_commits',
            'users_above_11': 'prev_users_above_11',
            'users_above_11_commits_per_above11_users':
            'prev_users_above_11_commits_per_above11_users',
            'users_capped_commit': 'prev_users_capped_commit',
            'users_above_11_500_cap_per_above11_users':
            'prev_users_above_11_500_cap_per_above11_users'
        })
    # commit_per_user
    users_per_project_prev[
        'prev_commit_per_user'] = users_per_project_prev.prev_commits / users_per_project_prev.prev_users
    # users_capped_commit_per_user
    users_per_project_prev['prev_users_capped_commit_per_user'] = (
        users_per_project_prev.prev_users_capped_commit /
        users_per_project_prev.prev_users)
    upp_adjacent = pd.merge(users_per_project_cur,
                            users_per_project_prev,
                            on='repo_name')
    print("Users Pearson", upp_adjacent.corr()['cur_users']['prev_users'])
    print("Users above 11 Pearson",
          upp_adjacent.corr()['cur_users_above_11']['prev_users_above_11'])
    print("Commits Pearson",
          upp_adjacent.corr()['cur_commits']['prev_commits'])
    print(
        "Capped commits Pearson",
        upp_adjacent.corr()['cur_users_capped_commit']
        ['prev_users_capped_commit'])
    print("Commits per user Pearson",
          upp_adjacent.corr()['cur_commit_per_user']['prev_commit_per_user'])
    print(
        "Commits per user above 11 Pearson",
        upp_adjacent.corr()['cur_users_above_11_commits_per_above11_users']
        ['prev_users_above_11_commits_per_above11_users'])
    print(
        "Capped commits per user Pearson",
        upp_adjacent.corr()['cur_users_capped_commit_per_user']
        ['prev_users_capped_commit_per_user'])
    print(
        "Capped commits, above 11 Pearson",
        upp_adjacent.corr()['cur_users_above_11_500_cap_per_above11_users']
        ['prev_users_above_11_500_cap_per_above11_users'])
    return upp_adjacent
def length_per_lang_figure(major_extensions_file, image_file):
    """Bar-plot average (capped) file length per language, Top 10 vs. Others.

    :param major_extensions_file: CSV with each repo's dominant extension and
        file-length statistics.
    :param image_file: output image file name for the grouped bar chart.
    """
    ext = pd.read_csv(major_extensions_file)
    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]
    trep = get_valid_repos()
    main = pd.merge(trep, dominant, on='repo_name')
    agg = main.groupby(['major_extension', 'quality_group'],
                       as_index=False).agg({
                           'major_capped_avg_file': {'mean', 'std'},
                           'repo_name': 'count'
                       })
    # NOTE(review): 'langauge' is a (misspelled) label used consistently
    # below; {'mean', 'std'} is a set, so the std-before-mean column order
    # assumed here may not be stable across pandas versions -- confirm.
    agg.columns = [
        u'langauge', u'quality_group', u'size_std', u'size_mean', u'projects'
    ]
    print(agg)
    top_size_mean = []
    top_size_std_err = []
    other_size_mean = []
    other_size_std_err = []
    # Collect mean length and standard error per language and quality group.
    for i in language_extensions:
        top_size_mean.append(
            round(agg[(agg.langauge == i)
                      & (agg.quality_group == 'Top 10')].iloc[0].size_mean))
        top_size_std_err.append(
            round(
                agg[(agg.langauge == i)
                    & (agg.quality_group == 'Top 10')].iloc[0].size_std /
                sqrt(agg[(agg.langauge == i)
                         & (agg.quality_group == 'Top 10')].iloc[0].projects)))
        other_size_mean.append(
            round(agg[(agg.langauge == i)
                      & (agg.quality_group == 'Others')].iloc[0].size_mean))
        other_size_std_err.append(
            round(
                agg[(agg.langauge == i)
                    & (agg.quality_group == 'Others')].iloc[0].size_std /
                sqrt(agg[(agg.langauge == i)
                         & (agg.quality_group == 'Others')].iloc[0].projects)))
    trace1 = go.Bar(x=lang_name,
                    y=top_size_mean,
                    name='Top Length',
                    error_y=dict(type='data',
                                 array=top_size_std_err,
                                 visible=True))
    trace2 = go.Bar(x=lang_name,
                    y=other_size_mean,
                    name='Other Length',
                    error_y=dict(type='data',
                                 array=other_size_std_err,
                                 visible=True))
    data = [trace1, trace2]
    #layout = go.Layout(
    #    barmode='group'
    #)
    layout = go.Layout(
        barmode='group',
        title='File length per language',
        xaxis=dict(title='Language',
                   titlefont=dict(family='Courier New, monospace',
                                  size=18,
                                  color='#7f7f7f')),
        yaxis=dict(title='Avgerage file length',
                   titlefont=dict(family='Courier New, monospace',
                                  size=18,
                                  color='#7f7f7f')))
    fig = go.Figure(data=data, layout=layout)
    plot(fig, image='png', image_filename=image_file, output_type='file')
def file_size_analysis(major_extensions_file):
    """Relate average (capped) file size to project quality (CCP).

    Prints size statistics, splits projects into size quartile groups, and
    compares 'Top 10' quality probability across groups -- overall, per
    language, and controlled by developer count and project age.
    :param major_extensions_file: CSV with per-repo file-size statistics.
    :return: merged per-repo DataFrame with the added 'size_group' column.
    """
    trep = get_valid_repos()
    rep_size = pd.read_csv(major_extensions_file)
    print('avg file mean', rep_size.avg_size.mean() / KILOBYTE)
    print('std file mean', rep_size.std_size.mean() / KILOBYTE)
    print('avg capped file mean', rep_size.capped_avg_file.mean() / KILOBYTE)
    print('std capped file mean', rep_size.capped_std_file.mean() / KILOBYTE)
    # NOTE(review): the next line repeats the 'avg capped file mean' print
    # above -- looks like an accidental duplicate.
    print('avg capped file mean', rep_size.capped_avg_file.mean() / KILOBYTE)
    print('std capped file mean/avg capped file mean',
          rep_size.capped_std_file.mean() / rep_size.capped_avg_file.mean())
    treps = pd.merge(trep, rep_size, on='repo_name')
    print(rep_size.capped_avg_file.describe())
    size_25_q = rep_size.capped_avg_file.quantile(0.25)
    print("size 25 quantile", size_25_q, "in kb", size_25_q / KILOBYTE)
    size_75_q = rep_size.capped_avg_file.quantile(0.75)
    print("size 75 quantile", size_75_q, "in kb", size_75_q / KILOBYTE)
    # Quartile-based size groups; the middle half is lumped together.
    treps['size_group'] = treps.apply(
        lambda x: 'Lower 25' if x.capped_avg_file < size_25_q else "top 25"
        if x.capped_avg_file > size_75_q else "Middle",
        axis=1)
    print('top 10 prob',
          1.0 * len(treps[treps.quality_group == 'Top 10']) / len(treps))
    top_10_in_l25 = 1.0 * len(
        treps[(treps.quality_group == 'Top 10')
              & (treps.size_group == 'Lower 25')]) / len(
                  treps[treps.size_group == 'Lower 25'])
    print('top 10 prob in lower 25', top_10_in_l25)
    top_10_in_t25 = 1.0 * len(
        treps[(treps.quality_group == 'Top 10')
              & (treps.size_group == 'top 25')]) / len(
                  treps[treps.size_group == 'top 25'])
    print('top 10 prob in top 25', top_10_in_t25)
    print("short files lift ", top_10_in_l25 / top_10_in_t25 - 1)
    group_by_size = treps.groupby(['size_group'],
                                  as_index=False).agg({'y2019_ccp': 'mean'})
    print(group_by_size)
    print("all files")
    print(
        treps.groupby('quality_group').agg({
            'capped_avg_file': 'mean',
            'avg_size': 'mean',
            'files': 'sum',
            'repo_name': 'count'
        }))
    # Per-language breakdown, restricted to repos with a dominant extension.
    for i in lang_name:
        print(i, " files")
        print(treps[(treps.major_extension_ratio > DOMINANT_RATE)
                    & (treps.major_extension == lang_extension[i])].groupby(
                        'quality_group').agg({
                            'capped_avg_file': 'mean',
                            'avg_size': 'mean',
                            'files': 'sum',
                            'repo_name': 'count'
                        }))
    print("Size controled by developer groups")
    pretty_print(
        pair_analysis_by_dev_num_group(treps, 'size_group', 'y2019_ccp'))
    print("Size controled by project age")
    pretty_print(pair_analysis_by_age_group(treps, 'size_group', 'y2019_ccp'))
    scatter(treps,
            first_metric='y2019_ccp',
            second_metric='capped_avg_file',
            output_file=os.path.join(FIGURES_PATH,
                                     r'ccp_vs_length_scatter.html'),
            mode='markers',
            opacity=0.9)
    pair_analysis_by_bins_to_file(treps,
                                  'y2019_ccp',
                                  'capped_avg_file',
                                  output_file=os.path.join(
                                      DATA_PATH, 'ccp_vs_length_bins.csv'),
                                  bins=10)
    return treps
def build_repo_ccp_dist():
    """Print the CCP distribution table (LaTeX) and year-over-year stability.

    Emits a LaTeX table of hit rate / CCP percentiles for the full data set
    and for the CCP-in-[0,1] subset, then prints 2018-vs-2019 correlations
    and the stability of repositories' relative positions.
    """
    rep = get_non_fork_repos()
    trep = get_valid_repos()
    # Sorted descending, so iloc[int(p * len)] yields the p-th "top" point.
    rep = rep.sort_values(['y2019_hit_rate'], ascending=False)
    trep = trep.sort_values(['y2019_hit_rate'], ascending=False)
    # NOTE(review): the first groupby result is discarded -- likely leftover.
    rep.groupby(['y2019_hit_rate_rnd']).agg({'repo_name': 'count'})
    g = rep.groupby(['y2019_hit_rate_rnd']).agg({'repo_name': 'count'})
    num_of_repos = len(rep)
    num_of_repos_in_range = len(trep)
    print()
    print(r"\begin{table}\centering")
    print(r"\caption{\label{tab:CCP-distrib}")
    print(r"CCP distribution in active GitHub projects}")
    print(r"\begin {tabular}{ | c | c | c | c | c |}")
    print(r"\hline")
    print(
        r"& \multicolumn {2} {c |} {Full data set} & \multicolumn {2} {c |} {CCP $\ in [0, 1]$}\\ "
    )
    print(r"& \multicolumn {2} {c |} {(", f'{num_of_repos:,}',
          "projects)} &\multicolumn")
    print(r"{2}{c |}{(", f'{num_of_repos_in_range:,}',
          r"projects)}\\ \cline {2 - 5}")
    print(
        r"Percentile & Hit rate & CCP est. & Hit rate & CCP est. \\ \hline")
    # Percentiles 10..90 plus 95.
    vals = [1.0 * i / 10 for i in range(1, 10)]
    vals.append(0.95)
    #vals.append(0.99)
    for i in vals:
        print(
            str(int(100 * i)) + " & ",
            str(round(rep.iloc[int(i * len(rep))].y2019_hit_rate, 2)),
            " & " + str(round(rep.iloc[int(i * len(rep))].y2019_ccp, 2)),
            " & " +
            str(round(trep.iloc[int(i * len(trep))].y2019_hit_rate, 2)),
            " & " + str(round(trep.iloc[int(i * len(trep))].y2019_ccp, 2)) +
            " \\\\ \hline")
    print(r"\end{tabular}")
    print(r"\end{table}")
    print()
    # For manual verification
    # print(rep.y2019_hit_rate_rnd.value_counts(normalize=True).sort_index().cumsum())
    # print(trep.y2019_hit_rate_rnd.value_counts(normalize=True).sort_index().cumsum())
    print("correlation between years")
    print("commits", trep.corr()[u'y2019_commits'][u'y2018_commits'])
    print("hits", trep.corr()[u'y2019_hits'][u'y2018_hits'])
    print("hit ratio", trep.corr()[u'y2019_hit_rate'][u'y2018_hit_rate'])
    print("ccp", trep.corr()[u'y2019_ccp'][u'y2018_ccp'])
    print()
    y2019_hit_rate_median = trep.iloc[int(len(trep) / 2)].y2019_hit_rate
    #print("y2018_hit_rate_median", y2018_hit_rate_median)
    # NOTE(review): both years are split by the 2019 median (and, below, the
    # 2019 10% point) -- presumably deliberate so one fixed threshold is used;
    # confirm, since the commented-out prints mention 2018 thresholds.
    trep['high_half_2019'] = trep.y2019_hit_rate.map(
        lambda x: x > y2019_hit_rate_median)
    trep['high_half_2018'] = trep.y2018_hit_rate.map(
        lambda x: x > y2019_hit_rate_median)
    g = trep.groupby(['high_half_2019', 'high_half_2018'],
                     as_index=False).agg({'repo_name': 'count'})
    g = g.rename(columns={'repo_name': 'cnt'})
    # Fraction of repos that stayed on the same side of the median.
    print(
        "stable half", 1.0 *
        g[((g.high_half_2019 == False) & (g.high_half_2018 == False))
          |
          ((g.high_half_2019 == True) & (g.high_half_2018 == True))].cnt.sum()
        / len(trep))
    y2019_hit_rate_10p = trep.iloc[int(90 * len(trep) / 100)].y2019_hit_rate
    #print("y2018_hit_rate_10p", y2018_hit_rate_10p)
    trep['high_10_2018'] = trep.y2018_hit_rate.map(
        lambda x: x < y2019_hit_rate_10p)
    trep['high_10_2019'] = trep.y2019_hit_rate.map(
        lambda x: x < y2019_hit_rate_10p)
    g = trep.groupby(['high_10_2018', 'high_10_2019'],
                     as_index=False).agg({'repo_name': 'count'})
    g = g.rename(columns={'repo_name': 'cnt'})
    print(
        "stable 10", 1.0 *
        g[((g.high_10_2018 == False) & (g.high_10_2019 == False))
          | ((g.high_10_2018 == True) & (g.high_10_2019 == True))].cnt.sum() /
        len(trep))
    print(
        "stay in top", 1.0 *
        g[((g.high_10_2018 == True) & (g.high_10_2019 == True))].iloc[0].cnt /
        g[g.high_10_2018 == True].cnt.sum())
    print(
        "get to top", 1.0 *
        g[((g.high_10_2018 == False) & (g.high_10_2019 == True))].iloc[0].cnt /
        g[g.high_10_2018 == False].cnt.sum())
    # Year-over-year CCP movement, absolute and signed.
    trep['cpp_abs_diff'] = trep.apply(
        lambda x: round(abs(x.y2019_ccp - x.y2018_ccp), 2), axis=1)
    # NOTE(review): the cumsum result is discarded -- likely a leftover from
    # interactive exploration.
    trep.cpp_abs_diff.value_counts(normalize=True).sort_index().cumsum()
    print("abs difference mean", trep.cpp_abs_diff.mean())
    print()
    trep['cpp_diff'] = trep.apply(
        lambda x: round(x.y2019_ccp - x.y2018_ccp, 2), axis=1)
    trep.cpp_diff.value_counts(normalize=True).sort_index().cumsum()
    print("difference mean", trep.cpp_diff.mean())
    print("2018 average ccp ratio",
          trep.y2018_hits.sum() * 1.0 / trep.y2018_commits.sum())
    print("2019 average ccp ratio",
          trep.y2019_hits.sum() * 1.0 / trep.y2019_commits.sum())
    print()
    print("CCP ratios")
    y2019_ccp_90p = trep.iloc[int(90 * len(trep) / 100)].y2019_ccp
    y2019_ccp_50p = trep.iloc[int(50 * len(trep) / 100)].y2019_ccp
    y2019_ccp_10p = trep.iloc[int(10 * len(trep) / 100)].y2019_ccp
    print("2019 top 90 CCP ", round(y2019_ccp_90p, 2))
    print("2019 top 50 CCP ", round(y2019_ccp_50p, 2))
    print("2019 top 10 CCP (worse) ", round(y2019_ccp_10p, 2))
    print("2019 top 50 CCP over top 90",
          round(y2019_ccp_50p / y2019_ccp_90p, 2))
    print("2019 top 10 CCP over top 90",
          round(y2019_ccp_10p / y2019_ccp_90p, 2))
def file_length_per_language(major_extensions_file, commits_per_user_file,
                             image_file):
    """Plot and tabulate development speed and CCP per dominant language.

    (Despite the name, this reports commits-per-developer speed and CCP per
    language -- overall and split by quality group -- as a grouped bar chart,
    a LaTeX table, a scatter plot, and a bins CSV.)
    :param major_extensions_file: CSV with each repo's dominant extension.
    :param commits_per_user_file: CSV with per-repo, per-year commit/user stats.
    :param image_file: output image file name for the bar chart.
    """
    ext = pd.read_csv(major_extensions_file)
    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]
    trep = get_valid_repos()
    major = pd.merge(trep, dominant, left_on='repo_name', right_on='repo_name')
    users_per_project = pd.read_csv(commits_per_user_file)
    # 2019 snapshot (presumably equals ANALYZED_YEAR -- confirm).
    users_per_project = users_per_project[users_per_project.year == 2019]
    trepu = pd.merge(major, users_per_project, on='repo_name')
    # Speed metrics, guarded against division by zero with None.
    trepu['commit_per_user'] = trepu.apply(lambda x: x.y2019_commits / x.users
                                           if x.users > 0 else None,
                                           axis=1)
    trepu['commit_per_user_above_11'] = trepu.apply(
        lambda x: x.users_above_11_commits / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)
    trepu['commit_per_user_cap'] = trepu.apply(
        lambda x: x.users_capped_commit / x.users if x.users > 0 else None,
        axis=1)
    trepu['commit_per_user_above_11_cap'] = trepu.apply(
        lambda x: x.commits_above_11_500_cap / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)
    agg_lang = trepu[trepu.major_extension.isin(language_extensions)].groupby(
        'major_extension', as_index=False).agg({
            'repo_name': 'count',
            'y2019_ccp': {'mean', 'std'},
            'commit_per_user_above_11_cap': {'mean', 'std'}
        })
    agg_lang.columns = agg_lang.columns.droplevel()
    # NOTE(review): {'mean', 'std'} is a set, so the flattened column order
    # assumed by these labels may not be stable across pandas versions --
    # confirm. 'langauge' is a misspelled label used consistently below.
    agg_lang.columns = [
        u'langauge', u'projects', u'ccp_mean', u'ccp_std', u'speed_mean',
        u'speed_std'
    ]
    agg_lang_quality = trepu[trepu.major_extension.isin(
        language_extensions)].groupby(['major_extension', 'quality_group'],
                                      as_index=False).agg({
                                          'repo_name': 'count',
                                          'commit_per_user_above_11_cap':
                                          {'mean', 'std'}
                                      })
    agg_lang_quality.columns = agg_lang_quality.columns.droplevel()
    # Dead code kept for reference: earlier rename-based column labeling.
    """
    agg_lang_quality = agg_lang_quality.rename(columns={
        'major_extension' : u'langauge' , 'std': u'speed_std' ,
        'mean': u'speed_mean' , 'count': u'projects' })
    """
    agg_lang_quality.columns = [
        u'langauge', u'quality_group', u'projects', u'speed_mean',
        u'speed_std'
    ]
    all_speed_mean = []
    all_speed_std = []
    top_speed_mean = []
    top_speed_std = []
    other_speed_mean = []
    other_speed_std = []
    ccp_mean = []
    ccp_std = []
    # Collect per-language means and standard errors for each plotted series.
    for i in language_extensions:
        top_speed_mean.append(
            round(agg_lang_quality[(agg_lang_quality.langauge == i)
                                   & (agg_lang_quality.quality_group ==
                                      'Top 10')].iloc[0].speed_mean))
        top_speed_std.append(
            round(agg_lang_quality[(agg_lang_quality.langauge == i)
                                   & (agg_lang_quality.quality_group ==
                                      'Top 10')].iloc[0].speed_std /
                  math.sqrt(agg_lang_quality[
                      (agg_lang_quality.langauge == i)
                      & (agg_lang_quality.quality_group == 'Top 10'
                         )].iloc[0].projects)))
        other_speed_mean.append(
            round(agg_lang_quality[(agg_lang_quality.langauge == i)
                                   & (agg_lang_quality.quality_group ==
                                      'Others')].iloc[0].speed_mean))
        other_speed_std.append(
            round(agg_lang_quality[(agg_lang_quality.langauge == i)
                                   & (agg_lang_quality.quality_group ==
                                      'Others')].iloc[0].speed_std /
                  math.sqrt(agg_lang_quality[
                      (agg_lang_quality.langauge == i)
                      & (agg_lang_quality.quality_group == 'Others'
                         )].iloc[0].projects)))
        ccp_mean.append(
            round(100 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean))
        ccp_std.append(100 * round(
            agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std /
            math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects)))
        all_speed_mean.append(
            round(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        all_speed_std.append(
            round(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std /
                  math.sqrt(agg_lang[
                      (agg_lang.langauge == i)].iloc[0].projects)))
    trace0 = go.Bar(x=lang_name,
                    y=all_speed_mean,
                    name='Speed',
                    error_y=dict(type='data',
                                 array=all_speed_std,
                                 visible=True))
    trace1 = go.Bar(x=lang_name,
                    y=top_speed_mean,
                    name='Top Speed',
                    error_y=dict(type='data',
                                 array=top_speed_std,
                                 visible=True))
    trace2 = go.Bar(x=lang_name,
                    y=other_speed_mean,
                    name='Other Speed',
                    error_y=dict(type='data',
                                 array=other_speed_std,
                                 visible=True))
    trace3 = go.Bar(x=lang_name,
                    y=ccp_mean,
                    name='CCP',
                    error_y=dict(type='data', array=ccp_std, visible=True))
    data = [trace0, trace1, trace2, trace3]
    layout = go.Layout(
        barmode='group',
        title='Speed and CCP per language',
        xaxis=dict(title='Language',
                   titlefont=dict(family='Courier New, monospace',
                                  size=24,
                                  color='#7f7f7f')),
        yaxis=dict(title='Commit per developer, CCP',
                   titlefont=dict(family='Courier New, monospace',
                                  size=24,
                                  color='#7f7f7f')))
    fig = go.Figure(data=data, layout=layout)
    plot(fig, image='png', image_filename=image_file, output_type='file')
    # LaTeX table: one row per language, sorted by mean CCP.
    print(r"\begin{tabular}{| l| l| l| l| l| l|}")
    print(r" \hline ")
    Title = r" Metric & Projects & CCP & Speed & Top Speed & Others Speed \\ \hline"
    print(Title)
    for i in agg_lang.sort_values('ccp_mean').langauge.tolist():
        Line = str(lang_by_extension(i))
        Line = Line + " & " + str(
            agg_lang[(agg_lang.langauge == i)].iloc[0].projects)
        Line = Line + " & " + str(
            round(1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean, 2))
        Line = Line + " $\pm$ " + str(
            round(
                1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std /
                math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects),
                3))
        Line = Line + " & " + str(
            int(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        Line = Line + " $\pm$ " + str(
            int(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std /
                math.sqrt(agg_lang[
                    (agg_lang.langauge == i)].iloc[0].projects)))
        Line = Line + " & " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_mean))
        Line = Line + " $\pm$ " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[
                    (agg_lang_quality.langauge == i)
                    & (agg_lang_quality.quality_group == 'Top 10'
                       )].iloc[0].projects)))
        Line = Line + " & " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_mean))
        Line = Line + " $\pm$ " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[
                    (agg_lang_quality.langauge == i)
                    & (agg_lang_quality.quality_group == 'Others'
                       )].iloc[0].projects)))
        Line = Line + r" \\ \hline"
        print(Line)
    scatter(trepu,
            first_metric='y2019_ccp',
            second_metric='commit_per_user_above_11_cap',
            output_file=os.path.join(FIGURES_PATH,
                                     r'ccp_vs_speed_scatter.html'),
            mode='markers',
            opacity=0.9)
    pair_analysis_by_bins_to_file(trepu,
                                  'y2019_ccp',
                                  'commit_per_user_above_11_cap',
                                  output_file=os.path.join(
                                      DATA_PATH, 'ccp_vs_speed_bins.csv'),
                                  bins=10)
def ccp_cdf_per_language(major_extensions_file, image_file):
    """Plot the CDF of project CCP for each common language.

    Only repositories with a dominant extension (major_extension_ratio above
    DOMINANT_RATE) are considered, and the CDF x-axis is truncated at LIMIT.

    :param major_extensions_file: CSV with per-repo major extension data
                                  (repo_name, major_extension, major_extension_ratio).
    :param image_file: path of the PNG image to write.
    :return: None (writes the plot to file as a side effect).
    """
    ext = pd.read_csv(major_extensions_file)
    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]
    print("Number of repositories with a dominant extension above",
          DOMINANT_RATE, " is ", len(dominant))
    trep = get_valid_repos()
    # Same key on both sides, so a plain `on=` merge is enough.
    major = pd.merge(trep, dominant, on='repo_name')
    traces = []
    for i in lang_name:
        # Empirical CDF of y2019_ccp for repos whose dominant extension
        # matches this language.
        cdf = major[major.major_extension ==
                    lang_extension[i]].y2019_ccp.value_counts(
                        normalize=True).sort_index().cumsum()
        cdf = pd.DataFrame(cdf)
        cdf = cdf.reset_index()
        cdf.columns = ['ccp', 'cdf']
        cdf = cdf[cdf.ccp < LIMIT]
        traces.append(go.Scatter(
            x=cdf.ccp,
            y=cdf.cdf,
            mode='lines',
            name=i
        ))
    layout = go.Layout(
        title='CDF of CCP for common languages',
        xaxis=dict(
            title='CCP',
            titlefont=dict(
                family='Courier New, monospace',
                size=24,
                color='#7f7f7f'
            )
        ),
        yaxis=dict(
            title='CDF of projects CCP',
            titlefont=dict(
                family='Courier New, monospace',
                size=24,
                color='#7f7f7f'
            )
        )
    )
    fig = go.Figure(data=traces, layout=layout)
    # Both calls are kept on purpose: plot() produces the interactive/offline
    # output, write_image() writes the static PNG directly.
    plot(fig, image='png', image_filename=image_file, output_type='file',
         image_width=800, image_height=400)
    fig.write_image(image_file)
def run_star_analysis():
    """Run the star-count analyses over all valid repositories."""
    valid_repos = get_valid_repos()
    do_stars_analysis(valid_repos)
    Linus_rule()
def quality_and_speed_over_years(commits_per_user_file):
    """Check whether year-over-year CCP changes co-occur with speed changes.

    Builds a per-repo (current year, previous year) table of
    corrective_commits_ratio (CCP proxy) and commits_per_above11_users
    (speed proxy), then prints several confusion matrices relating
    improvement in one metric to improvement in the other.

    :param commits_per_user_file: CSV with per-repo, per-year commit/user stats.
    :return: None (prints all results).
    """
    print("over the years ccp and speed change")
    trep = get_valid_repos()
    trep = trep[['repo_name']]
    users_per_project = pd.read_csv(commits_per_user_file)
    # NOTE(review): sibling functions filter with EARLIEST_ANALYZED_YEAR;
    # the literal 2014 here is presumably the same cutoff — confirm.
    users_per_project = users_per_project[users_per_project.year > 2014]
    df = pd.merge(users_per_project, trep, on='repo_name')
    df = df[[
        'repo_name', 'year', 'corrective_commits_ratio',
        'commits_per_above11_users'
    ]]
    df = df.dropna()
    # Self-join each (repo, year) row with the same repo's previous year.
    cur_df = df.copy()
    cur_df['prev_year'] = cur_df.year - 1
    cur_df = cur_df.rename(
        columns={
            'year': 'cur_year',
            'corrective_commits_ratio': 'cur_corrective_commits_ratio',
            'commits_per_above11_users': 'cur_commits_per_above11_users'
        })
    prev_df = df.copy()
    prev_df = prev_df.rename(
        columns={
            'year': 'prev_year',
            'corrective_commits_ratio': 'prev_corrective_commits_ratio',
            'commits_per_above11_users': 'prev_commits_per_above11_users'
        })
    two_years = pd.merge(cur_df,
                         prev_df,
                         left_on=['repo_name', 'prev_year'],
                         right_on=['repo_name', 'prev_year'])
    # Lower CCP is better; higher speed is better.
    two_years[
        'improved_ccp'] = two_years.cur_corrective_commits_ratio < two_years.prev_corrective_commits_ratio
    two_years[
        'hurt_ccp'] = two_years.cur_corrective_commits_ratio > two_years.prev_corrective_commits_ratio
    two_years[
        'improved_speed'] = two_years.cur_commits_per_above11_users > two_years.prev_commits_per_above11_users
    g = two_years.groupby(['improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)
    cm = ConfusionMatrix(g_df=g,
                         classifier='improved_ccp',
                         concept='improved_speed',
                         count='repo_name')
    print(cm.summarize())
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision())
    # cm.recall() == tp / (fn + tp); use the named accessor for consistency
    # with the sections below.
    print("ccp improvement given speed improvement", cm.recall())
    # "Significant" improvement thresholds (0.1 CCP, 10 commits/user) match
    # the sig thresholds used by the cochange analyses in this module.
    two_years[
        'sig_improved_ccp'] = two_years.cur_corrective_commits_ratio < two_years.prev_corrective_commits_ratio - 0.1
    two_years[
        'sig_improved_speed'] = two_years.cur_commits_per_above11_users > two_years.prev_commits_per_above11_users + 10
    g = two_years.groupby(['sig_improved_ccp', 'sig_improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)
    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_ccp',
                         concept='sig_improved_speed',
                         count='repo_name')
    print(cm.summarize())
    g = two_years.groupby(['sig_improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)
    # FIX: the original printed the stale sig/sig ConfusionMatrix here, so the
    # whole "sig ccp vs. any speed improvement" section reported the wrong
    # table; rebuild the matrix on the new grouping.
    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_ccp',
                         concept='improved_speed',
                         count='repo_name')
    print(cm.summarize())
    print()
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision(), "lift",
          cm.precision_lift())
    print("ccp improvement given speed improvement", cm.recall(), "lift",
          cm.recall() / cm.hit_rate() - 1)
    print()
    g = two_years.groupby(['sig_improved_speed', 'hurt_ccp'],
                          as_index=False).agg({'repo_name': 'count'})
    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_speed',
                         concept='hurt_ccp',
                         count='repo_name')
    print(cm.summarize())
    print()
    print("ccp hurt given significant speed improvement", cm.precision(),
          "lift", cm.precision_lift())
    print()
import os import pandas as pd import plotly import plotly.graph_objects as go from analysis_configuration import EARLIEST_ANALYZED_YEAR from configuration import FIGURES_PATH from repo_utils import get_valid_repos df = get_valid_repos() def repos_by_lang(): g_by_lang = df.groupby(['language'], as_index=False).agg({'repo_name' : 'nunique' , 'commits' : 'mean' }) g_by_lang = g_by_lang[g_by_lang.repo_name > 9] g_by_lang = g_by_lang.sort_values(['repo_name', 'language'], ascending=[True, True]) graphs = [ go.Bar(x=g_by_lang['language'], y=g_by_lang['repo_name'], name='repos') , go.Bar(x=g_by_lang['language'], y=g_by_lang['commits'], name='commits') ] fig = go.Figure(data=graphs) fig.update_layout( title=go.layout.Title( text="Repositories by language", xref="paper", x=0 ), xaxis=go.layout.XAxis(
def decile_analysis(major_extensions_file, coupling_file,
                    commits_per_user_file, churn_file, onboarding_file,
                    reuse_file, output_file):
    """Aggregate repo metrics per CCP decile and write the table to CSV.

    Valid repos are binned into 10 quantile bins of y2019_ccp, then joined
    (left merges, so repos missing a data source keep NaNs) with file-length,
    coupling, commits-per-user, churn, onboarding, and reuse data. Mean of
    each metric in `metrics` plus a repo count is written per decile, and the
    plotting helpers are invoked.

    :param major_extensions_file: CSV with per-repo file size data
                                  (capped_avg_file) — assumed; confirm schema.
    :param coupling_file: CSV with per-repo, per-year coupling (avg_capped_files).
    :param commits_per_user_file: CSV with per-repo, per-year user/commit counts.
    :param churn_file: CSV with per-repo, per-year continuing_developers_ratio.
    :param onboarding_file: CSV with per-repo, per-year
                            comming_involved_developers_ratio.
    :param reuse_file: CSV with per-repo reuse data (package_avg).
    :param output_file: path of the per-decile aggregate CSV to write.
    :return: the merged per-repo DataFrame (before the decile aggregation).
    """
    repos = get_valid_repos()
    # Avoid a column clash with the per-year 'commits' columns merged in below.
    repos = repos.rename(columns={'commits': 'repo_all_commits'})
    # Adds the 'y2019_ccp_10bins' decile column in place (by 10 quantiles).
    bin_metric_by_quantiles(repos, 'y2019_ccp', 'y2019_ccp_10bins', bins=10)
    # File length
    rep_size = pd.read_csv(major_extensions_file)
    df = pd.merge(repos, rep_size, on='repo_name', how='left')
    df['Capped_Length_KB'] = df.capped_avg_file / KILOBYTE
    # Coupling
    coupling_size = pd.read_csv(coupling_file)
    coupling_size = coupling_size[coupling_size.year == ANALYZED_YEAR]
    df = pd.merge(df, coupling_size, on='repo_name', how='left')
    df['Commit_Size_Capped'] = df['avg_capped_files']
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year ==
                                          ANALYZED_YEAR]
    df = pd.merge(df, users_per_project, on='repo_name', how='left')
    # Per-row guarded divisions: None when the denominator is 0 (or NaN,
    # since NaN > 0 is False).
    df['commit_per_user'] = df.apply(lambda x: x.y2019_commits / x.users
                                     if x.users > 0 else None,
                                     axis=1)
    df['commit_per_user_above_11'] = df.apply(
        lambda x: x.users_above_11_commits / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)
    df['commit_per_user_cap'] = df.apply(
        lambda x: x.users_capped_commit / x.users if x.users > 0 else None,
        axis=1)
    # NOTE(review): 'Cappped' is a typo, but this column name is a runtime
    # string likely referenced by `metrics` and the plot helpers — do not
    # rename here without updating all consumers.
    df['Commit_Per_Involved_User_Cappped'] = df.apply(
        lambda x: x.commits_above_11_500_cap / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)
    # print(df.groupby(['y2019_ccp_10bins']).agg({'Commit_Per_Involved_User_Cappped' : 'mean', 'repo_name' : 'count'}).sort_index())
    # Log-scale versions for skewed counts; non-positive values pass through
    # unchanged rather than becoming NaN/None.
    df['repo_all_commits_log10'] = df.repo_all_commits.map(lambda x: log10(x)
                                                           if x > 0 else x)
    df['authors_log10'] = df.authors.map(lambda x: log10(x) if x > 0 else x)
    df['stargazers_count_log10'] = df.stargazers_count.map(
        lambda x: log10(x) if x > 0 else x)
    churn = pd.read_csv(churn_file)
    # Churn is taken from the year BEFORE the analyzed year.
    churn = churn[churn.year == ANALYZED_YEAR - 1]
    df = pd.merge(df, churn, on='repo_name', how='left')
    df['churn'] = 1.0 - df['continuing_developers_ratio']
    onboarding = pd.read_csv(onboarding_file)
    onboarding = onboarding[onboarding.year == ANALYZED_YEAR]
    df = pd.merge(df, onboarding, on='repo_name', how='left')
    df['Onboarding'] = df.comming_involved_developers_ratio
    reuse = pd.read_csv(reuse_file)
    df = pd.merge(df, reuse, on='repo_name', how='left')
    # Mean of every metric in the module-level `metrics` list, plus a count
    # of repos per decile (reusing the 'repo_name' column for the count).
    aggregations = {i: 'mean' for i in metrics}
    aggregations['repo_name'] = 'count'
    g = df.groupby('y2019_ccp_10bins', as_index=False).agg(aggregations)
    g.to_csv(output_file)
    plot_all_metrics(df, grouping_column='y2019_ccp_10bins')
    plot_by_ccp_all_metrics(df,
                            grouping_columns=[
                                'Capped_Length_KB', 'Commit_Size_Capped',
                                'package_avg'
                            ])
    #plot_ccp_by_length_per_lang(df)
    return df