def bootstrap_diff(df, ccp_estimator, rounds, sample_size):
    """Bootstrap the gap between estimated CCP and the observed positive rate.

    Draws `rounds` resamples of `sample_size` rows (with replacement) from
    `df`; for each resample it builds a confusion matrix, estimates CCP from
    the hit rate, and records the difference against the observed positive
    rate.

    Note: relies on the module-level column names `classifier`, `concept`
    and `count`.

    Returns a DataFrame with columns:
    positive_rate, hit_rate, ccp, ccp_diff.
    """
    rows = []
    for round_idx in range(rounds):
        # Resample with replacement and aggregate into a confusion matrix.
        resample = df.sample(sample_size, replace=True)
        grouped = resample.groupby([classifier, concept],
                                   as_index=False).agg({count: 'count'})
        sample_cm = ConfusionMatrix(g_df=grouped,
                                    classifier=classifier,
                                    concept=concept,
                                    count=count)
        positive_rate = sample_cm.positive_rate()
        hit_rate = sample_cm.hit_rate()
        ccp = ccp_estimator.estimate_positives(hit_rate)
        # ccp_diff: how far the CCP estimate is from the observed rate.
        rows.append([positive_rate, hit_rate, ccp, ccp - positive_rate])
        if round_idx % 100 == 0:
            print("finished " + str(round_idx), datetime.datetime.now())
    return pd.DataFrame(
        rows, columns=['positive_rate', 'hit_rate', 'ccp', 'ccp_diff'])
def quality_and_speed_over_years(commits_per_user_file):
    """Track how CCP (quality) and commit speed co-change across years.

    Self-joins each project's metrics for year N with its metrics for year
    N-1, flags improvements/degradations, and prints confusion-matrix
    summaries of how often quality and speed improve together — both for any
    improvement and for "significant" improvements (fixed thresholds).

    Parameters
    ----------
    commits_per_user_file : str
        Path to a CSV holding per-repo, per-year metrics with at least the
        columns repo_name, year, corrective_commits_ratio and
        commits_per_above11_users.
    """
    print("over the years ccp and speed change")
    trep = get_valid_repos()
    trep = trep[['repo_name']]
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year > 2014]
    df = pd.merge(users_per_project, trep, on='repo_name')
    df = df[['repo_name', 'year', 'corrective_commits_ratio',
             'commits_per_above11_users']]
    df = df.dropna()

    # Build a (current year, previous year) self-join per repo.
    cur_df = df.copy()
    cur_df['prev_year'] = cur_df.year - 1
    cur_df = cur_df.rename(columns={
        'year': 'cur_year',
        'corrective_commits_ratio': 'cur_corrective_commits_ratio',
        'commits_per_above11_users': 'cur_commits_per_above11_users'})
    prev_df = df.copy()
    prev_df = prev_df.rename(columns={
        'year': 'prev_year',
        'corrective_commits_ratio': 'prev_corrective_commits_ratio',
        'commits_per_above11_users': 'prev_commits_per_above11_users'})
    two_years = pd.merge(cur_df, prev_df,
                         left_on=['repo_name', 'prev_year'],
                         right_on=['repo_name', 'prev_year'])

    # Lower CCP is better quality; higher commits-per-user is faster.
    two_years['improved_ccp'] = (
        two_years.cur_corrective_commits_ratio
        < two_years.prev_corrective_commits_ratio)
    two_years['hurt_ccp'] = (
        two_years.cur_corrective_commits_ratio
        > two_years.prev_corrective_commits_ratio)
    two_years['improved_speed'] = (
        two_years.cur_commits_per_above11_users
        > two_years.prev_commits_per_above11_users)

    g = two_years.groupby(['improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)
    cm = ConfusionMatrix(g_df=g, classifier='improved_ccp',
                         concept='improved_speed', count='repo_name')
    print(cm.summarize())
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision())
    # FIX: was cm.tp() / (cm.fn() + cm.tp()), i.e. recall spelled out by
    # hand; use recall() for consistency with the analyses below.
    print("ccp improvement given speed improvement", cm.recall())

    # "Significant" changes only, using fixed thresholds.
    two_years['sig_improved_ccp'] = (
        two_years.cur_corrective_commits_ratio
        < two_years.prev_corrective_commits_ratio - 0.1)
    two_years['sig_improved_speed'] = (
        two_years.cur_commits_per_above11_users
        > two_years.prev_commits_per_above11_users + 10)
    g = two_years.groupby(['sig_improved_ccp', 'sig_improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)
    cm = ConfusionMatrix(g_df=g, classifier='sig_improved_ccp',
                         concept='sig_improved_speed', count='repo_name')
    print(cm.summarize())

    g = two_years.groupby(['sig_improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)
    # FIX: the original re-printed the stale sig/sig matrix here; summarize
    # the matrix of the grouping just printed instead. The statistics below
    # still use the sig/sig matrix `cm`, matching the original labels.
    cm_mixed = ConfusionMatrix(g_df=g, classifier='sig_improved_ccp',
                               concept='improved_speed', count='repo_name')
    print(cm_mixed.summarize())
    print()
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision(),
          "lift", cm.precision_lift())
    print("ccp improvement given speed improvement", cm.recall(),
          "lift", cm.recall() / cm.hit_rate() - 1)
    print()

    # Does a significant speed improvement come at the cost of quality?
    g = two_years.groupby(['sig_improved_speed', 'hurt_ccp'],
                          as_index=False).agg({'repo_name': 'count'})
    cm = ConfusionMatrix(g_df=g, classifier='sig_improved_speed',
                         concept='hurt_ccp', count='repo_name')
    print(cm.summarize())
    print()
    print("ccp hurt given significant speed improvement", cm.precision(),
          "lift", cm.precision_lift())
    print()
def two_years_analysis(two_years_df, first_metric, second_metric, key):
    """Print a co-change analysis of two boolean metrics over a year pair.

    Groups `two_years_df` by the two metric columns, builds a confusion
    matrix with `first_metric` as the classifier and `second_metric` as
    the concept, and prints the match rate plus each conditional
    improvement probability with its lift.
    """
    print()
    print("Co-change", first_metric, second_metric)
    grouped = (two_years_df
               .groupby([first_metric, second_metric], as_index=False)
               .agg({key: 'count'}))
    print(grouped)
    matrix = ConfusionMatrix(g_df=grouped,
                             classifier=first_metric,
                             concept=second_metric,
                             count=key)
    print(matrix.summarize())
    print()
    print("Samples", matrix.samples())
    print("Both metrics increment match", matrix.accuracy())
    print(second_metric, " improvement given ", first_metric, " improvement",
          matrix.precision(), "lift", matrix.precision_lift())
    # Recall lift guarded against null/zero via the project helpers.
    recall_lift = ifnull(safe_divide(ifnull(matrix.recall()),
                                     matrix.hit_rate())) - 1
    print(first_metric, " improvement given ", second_metric, "improvement",
          matrix.recall(), "lift", recall_lift)
    print()