def calculate_scores_for_prs(database, starting_pr_number, limit):
    # TODO: add comments for all the scripts
    logging.basicConfig(level=logging.INFO, filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')
    df1 = pd.DataFrame()

    # Connection to MySQL database
    connection = pymysql.connect(host='localhost', port=3306, user='******', passwd='', db=database)
    try:
        with connection.cursor() as cursor:
            # Read records
            query1 = "SELECT * FROM pull_request LIMIT %s OFFSET %s"
            inputs = (limit, starting_pr_number)
            cursor.execute(query1, inputs)
            all_prs = cursor.fetchall()
    finally:
        connection.close()

    for new_pr in all_prs:
        new_pr = PullRequest(new_pr)
        row = calculate_scores(database, new_pr)
        df1 = df1.append(row, ignore_index=True)
        logging.info(new_pr.pr_id)
        print(new_pr.pr_id)

    df1.to_csv('pr_stats.csv', index=False)
    print(df1)
def get_related_integrators_for_pr_by_pr_number(self, pr_number):
    """
    Calculates scores for each factor for each integrator and provides a ranked data frame
    containing the top five integrators.

    :EXAMPLE:

    >>> interec.get_related_integrators_for_pr_by_pr_number(10)

    :param pr_number: PR id number
    :type pr_number: int
    :return: Top five integrators data frame
    :rtype: DataFrame
    """
    logging.info("Getting related integrators by PR number for PR " + str(pr_number) + " started")
    pr_details = self.get_pr_details(pr_number)
    new_pr = PullRequest(pr_details)
    df = pd.DataFrame()
    df = self.__calculate_scores(df, new_pr, self.date_window)
    ranked_df = self.generate_ranked_list(df, self.alpha, self.beta, self.gamma)
    sorted_ranked_data_frame = ranked_df.sort_values('final_rank', ascending=True)
    ranked_five_df = sorted_ranked_data_frame[sorted_ranked_data_frame['final_rank'] <= 5]
    logging.info("Top five integrators for PR " + str(pr_number) + " presented")
    return ranked_five_df
def get_related_integrators_for_pr(self, pr_number, requester_login, title, description, created_date_time, files):
    """
    Calculates scores for each factor for each integrator and provides a ranked data frame
    containing the top five integrators.

    :EXAMPLE:

    >>> interec.get_related_integrators_for_pr(10, 'John', 'PR Title', 'PR Description',
    ...                                        '2019-03-10 17:52:31', 'abc.js|def.js|ghi.js')

    :param pr_number: PR id number
    :type pr_number: int
    :param requester_login: Contributor username
    :type requester_login: String
    :param title: Title of the PR
    :type title: String
    :param description: Description of the PR
    :type description: String
    :param created_date_time: PR created date and time
    :type created_date_time: String
    :param files: File paths of the PR
    :type files: String
    :return: Top five integrators data frame
    :rtype: DataFrame
    """
    logging.info("Getting related integrators by PR details for PR " + str(pr_number) + " started")
    created_date_time = datetime.strptime(created_date_time, '%Y-%m-%d %H:%M:%S')
    pr_data = [0, pr_number, requester_login, title, description, created_date_time, 0, " ", files]
    new_pr = PullRequest(pr_data)
    df = pd.DataFrame()
    df = self.__calculate_scores(df, new_pr, self.date_window)
    ranked_df = self.generate_ranked_list(df, self.alpha, self.beta, self.gamma)
    sorted_ranked_data_frame = ranked_df.sort_values('final_rank', ascending=True)
    ranked_five_df = sorted_ranked_data_frame[sorted_ranked_data_frame['final_rank'] <= 5]
    logging.info("Top five integrators for PR " + str(pr_number) + " presented")
    return ranked_five_df
def get_recommendation(column_name, required_integrators):  # new_pr,
    logging.basicConfig(level=logging.INFO, filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')
    query_demo = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE pr_id = '2001'"
    pr_demo = spark.sql(query_demo)
    new_pr = pr_demo.collect()[0]
    new_pr = PullRequest(new_pr)
    print(new_pr.pr_id)

    ranked_data_frame = generate_ranked_list(new_pr)
    sorted_ranked_data_frame = ranked_data_frame.sort_values(column_name, ascending=True)
    recommended_integrators = sorted_ranked_data_frame[sorted_ranked_data_frame[column_name] <= required_integrators]

    print("Position    Integrator")
    print("-------------------------------")
    for row in recommended_integrators.itertuples(index=False):
        print("   " + str(row.final_rank) + "        " + row.integrator)
def __calculate_scores_for_all_prs(self, offset, limit, date_window=120):
    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = self.spark.sql(query1)
    total_prs = 0
    df = pd.DataFrame()

    for new_pr in all_prs.collect():
        total_prs += 1
        new_pr = PullRequest(new_pr)
        df = self.__calculate_scores(df, new_pr, date_window)
        print("Scores calculated for: " + str(date_window) + "_" + str(new_pr.pr_id))
        logging.info("Scores calculated for: " + str(date_window) + "_" + str(new_pr.pr_id))

    df.to_csv(str(date_window) + "_" + self.database + "_all_integrator_scores_for_each_test_pr.csv", index=False)
    return df
def calculate_scores(new_pr):
    df1 = pd.DataFrame()

    # Calculate scores for each integrator
    for integrator in all_integrators:
        pr_integrator = Integrator(integrator[1])

        # Read all the PRs the integrator reviewed before
        query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE merged_date < timestamp('%s') AND integrator_login = '%s'" % \
                 (new_pr.created_date, pr_integrator.integrator_login)
        integrator_reviewed_prs = spark.sql(query1).collect()
        print(len(integrator_reviewed_prs))  # TODO: Remove this

        for integrator_reviewed_pr in integrator_reviewed_prs:
            old_pr = PullRequest(integrator_reviewed_pr)
            old_pr_file_paths = old_pr.files

            # Calculate file path similarity
            for new_pr_file_path in new_pr.files:
                for file_path in old_pr_file_paths:
                    number_of_file_combinations = len(old_pr_file_paths) * len(new_pr.files)
                    max_file_path_length = max(len(new_pr_file_path.split("/")), len(file_path.split("/")))
                    divider = max_file_path_length * number_of_file_combinations

                    pr_integrator.longest_common_prefix_score += \
                        (longest_common_prefix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_suffix_score += \
                        (longest_common_suffix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_string_score += \
                        (longest_common_sub_string(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_sequence_score += \
                        (longest_common_sub_sequence(new_pr_file_path, file_path) / divider)

            # Calculate cosine similarity of title
            pr_integrator.pr_title_similarity += cos_similarity(new_pr.title, old_pr.title)

            # Calculate cosine similarity of description
            if new_pr.description != "" and old_pr.description != "":
                pr_integrator.pr_description_similarity += cos_similarity(new_pr.description, old_pr.description)

            # Calculate activeness of the integrator
            pr_integrator.activeness += calculate_integrator_activeness(new_pr, old_pr)

        row = {
            'integrator': pr_integrator.integrator_login,
            'lcp': pr_integrator.longest_common_prefix_score,
            'lcs': pr_integrator.longest_common_suffix_score,
            'lc_substr': pr_integrator.longest_common_sub_string_score,
            'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
            'cos_title': pr_integrator.pr_title_similarity,
            'cos_description': pr_integrator.pr_description_similarity,
            'activeness': pr_integrator.activeness
        }
        df1 = df1.append(row, ignore_index=True)

    return df1
def test_accuracy_for_all_prs(offset, limit):
    logging.basicConfig(level=logging.INFO, filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')
    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = spark.sql(query1)
    total_prs = 0
    cmb_accuracy_array = [0, 0, 0]
    file_accuracy_array = [0, 0, 0]
    txt_accuracy_array = [0, 0, 0]
    act_accuracy_array = [0, 0, 0]

    for new_pr in all_prs.collect():
        total_prs += 1
        new_pr = PullRequest(new_pr)
        print(new_pr.pr_id)  # TODO: Remove this
        ranked_data_frame = generate_ranked_list(new_pr)

        combined_accuracy = test_combined_accuracy(ranked_data_frame, new_pr, True, True, True)
        file_path_accuracy = test_file_path_similarity_accuracy(ranked_data_frame, new_pr, True, True, True)
        text_accuracy = test_text_similarity_accuracy(ranked_data_frame, new_pr, True, True, True)
        activeness_accuracy = test_activeness_accuracy(ranked_data_frame, new_pr, True, True, True)

        if hasattr(combined_accuracy, 'top1') and combined_accuracy.top1:
            cmb_accuracy_array[0] += 1
        if hasattr(combined_accuracy, 'top3') and combined_accuracy.top3:
            cmb_accuracy_array[1] += 1
        if hasattr(combined_accuracy, 'top5') and combined_accuracy.top5:
            cmb_accuracy_array[2] += 1

        if hasattr(file_path_accuracy, 'top1') and file_path_accuracy.top1:
            file_accuracy_array[0] += 1
        if hasattr(file_path_accuracy, 'top3') and file_path_accuracy.top3:
            file_accuracy_array[1] += 1
        if hasattr(file_path_accuracy, 'top5') and file_path_accuracy.top5:
            file_accuracy_array[2] += 1

        if hasattr(text_accuracy, 'top1') and text_accuracy.top1:
            txt_accuracy_array[0] += 1
        if hasattr(text_accuracy, 'top3') and text_accuracy.top3:
            txt_accuracy_array[1] += 1
        if hasattr(text_accuracy, 'top5') and text_accuracy.top5:
            txt_accuracy_array[2] += 1

        if hasattr(activeness_accuracy, 'top1') and activeness_accuracy.top1:
            act_accuracy_array[0] += 1
        if hasattr(activeness_accuracy, 'top3') and activeness_accuracy.top3:
            act_accuracy_array[1] += 1
        if hasattr(activeness_accuracy, 'top5') and activeness_accuracy.top5:
            act_accuracy_array[2] += 1

    avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
    avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
    avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs
    avg_file_path_top1_accuracy = file_accuracy_array[0] / total_prs
    avg_file_path_top3_accuracy = file_accuracy_array[1] / total_prs
    avg_file_path_top5_accuracy = file_accuracy_array[2] / total_prs
    avg_text_top1_accuracy = txt_accuracy_array[0] / total_prs
    avg_text_top3_accuracy = txt_accuracy_array[1] / total_prs
    avg_text_top5_accuracy = txt_accuracy_array[2] / total_prs
    avg_act_top1_accuracy = act_accuracy_array[0] / total_prs
    avg_act_top3_accuracy = act_accuracy_array[1] / total_prs
    avg_act_top5_accuracy = act_accuracy_array[2] / total_prs

    print("---------------------------------------------------------------------------")
    print("                        Top1      Top3      Top5")
    print("Combined Accuracy       " + str(avg_combined_top1_accuracy) + "      "
          + str(avg_combined_top3_accuracy) + "      " + str(avg_combined_top5_accuracy))
    print("File Path Accuracy      " + str(avg_file_path_top1_accuracy) + "      "
          + str(avg_file_path_top3_accuracy) + "      " + str(avg_file_path_top5_accuracy))
    print("Text Accuracy           " + str(avg_text_top1_accuracy) + "      "
          + str(avg_text_top3_accuracy) + "      " + str(avg_text_top5_accuracy))
    print("Activeness Accuracy     " + str(avg_act_top1_accuracy) + "      "
          + str(avg_act_top3_accuracy) + "      " + str(avg_act_top5_accuracy))
def calculate_scores(offset, limit):
    df = pd.DataFrame()
    logging.basicConfig(level=logging.INFO, filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')
    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = spark.sql(query1)

    for test_pr in all_prs.collect():
        test_pr = PullRequest(test_pr)
        print(test_pr.pr_id)
        logging.info(test_pr.pr_id)
        pr_integrator = Integrator(test_pr.integrator_login)

        # Calculate scores for the integrator
        # Read all the PRs the integrator reviewed before
        query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE merged_date < timestamp('%s') AND integrator_login = '%s'" % \
                 (test_pr.created_date, pr_integrator.integrator_login)
        integrator_reviewed_prs = spark.sql(query1).collect()

        for integrator_reviewed_pr in integrator_reviewed_prs:
            old_pr = PullRequest(integrator_reviewed_pr)
            old_pr_file_paths = old_pr.files

            # Calculate file path similarity
            for new_pr_file_path in test_pr.files:
                for file_path in old_pr_file_paths:
                    number_of_file_combinations = len(old_pr_file_paths) * len(test_pr.files)
                    max_file_path_length = max(len(new_pr_file_path.split("/")), len(file_path.split("/")))
                    divider = max_file_path_length * number_of_file_combinations

                    pr_integrator.longest_common_prefix_score += \
                        (longest_common_prefix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_suffix_score += \
                        (longest_common_suffix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_string_score += \
                        (longest_common_sub_string(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_sequence_score += \
                        (longest_common_sub_sequence(new_pr_file_path, file_path) / divider)

            # Calculate cosine similarity of title
            pr_integrator.pr_title_similarity += cos_similarity(test_pr.title, old_pr.title)

            # Calculate cosine similarity of description
            if test_pr.description != "" and old_pr.description != "":
                pr_integrator.pr_description_similarity += cos_similarity(test_pr.description, old_pr.description)

            # Calculate activeness of the integrator
            pr_integrator.activeness += calculate_integrator_activeness(test_pr, old_pr)

        row = {
            'pr_id': test_pr.pr_id,
            'integrator': pr_integrator.integrator_login,
            'lcp': pr_integrator.longest_common_prefix_score,
            'lcs': pr_integrator.longest_common_suffix_score,
            'lc_substr': pr_integrator.longest_common_sub_string_score,
            'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
            'cos_title': pr_integrator.pr_title_similarity,
            'cos_description': pr_integrator.pr_description_similarity,
            'activeness': pr_integrator.activeness,
            'text_similarity': pr_integrator.pr_title_similarity + pr_integrator.pr_description_similarity,
            'file_similarity': (pr_integrator.longest_common_prefix_score
                                + pr_integrator.longest_common_suffix_score
                                + pr_integrator.longest_common_sub_string_score
                                + pr_integrator.longest_common_sub_sequence_score)
        }
        df = df.append(row, ignore_index=True)

    csv_file_name = database + "_test_pr_stats.csv"
    df.to_csv(csv_file_name, index=False)
def test_weight_combination_accuracy_for_all_prs(self, interec_processor, offset, limit, main_data_frame):
    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = self.spark.sql(query1)
    results = []

    for i in range(1, 9):
        for j in range(1, 9):
            for k in range(1, 9):
                if i != 0 and j != 0 and k != 0 and i + j + k == 10:
                    total_prs = 0
                    cmb_accuracy_array = [0, 0, 0]
                    combined_mrr = 0

                    print("")
                    print("---------------------------------------------------------------------------")
                    print("alpha= " + str(i / 10) + " beta= " + str(j / 10) + " gamma= " + str(k / 10))
                    logging.info("")
                    logging.info("---------------------------------------------------------------------------")
                    logging.info("alpha= " + str(i / 10) + " beta= " + str(j / 10) + " gamma= " + str(k / 10))

                    for new_pr in all_prs.collect():
                        total_prs += 1
                        new_pr = PullRequest(new_pr)
                        scores_df = main_data_frame.loc[main_data_frame['new_pr_id'] == new_pr.pr_id].copy()
                        ranked_data_frame \
                            = interec_processor.generate_ranked_list(scores_df, i / 10, j / 10, k / 10)

                        combined_rank = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame, new_pr,
                                                                                          'final_rank', 1)
                        if combined_rank != 0:
                            combined_mrr = combined_mrr + (1.0 / combined_rank)

                        combined_accuracy \
                            = self.__test_combined_accuracy(ranked_data_frame, new_pr, True, True, True)
                        if hasattr(combined_accuracy, 'top1') and combined_accuracy.top1:
                            cmb_accuracy_array[0] += 1
                        if hasattr(combined_accuracy, 'top3') and combined_accuracy.top3:
                            cmb_accuracy_array[1] += 1
                        if hasattr(combined_accuracy, 'top5') and combined_accuracy.top5:
                            cmb_accuracy_array[2] += 1

                    combined_mrr = combined_mrr / total_prs
                    avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
                    avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
                    avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs

                    combination_result = {
                        'alpha': (i / 10),
                        'beta': (j / 10),
                        'gamma': (k / 10),
                        'top1': avg_combined_top1_accuracy,
                        'top3': avg_combined_top3_accuracy,
                        'top5': avg_combined_top5_accuracy,
                        'mrr': combined_mrr
                    }
                    results.append(combination_result)

                    print("---------------------------------------------------------------------------")
                    print("                        Top1      Top3      Top5")
                    print("Combined Accuracy       " + str(avg_combined_top1_accuracy) + "      "
                          + str(avg_combined_top3_accuracy) + "      " + str(avg_combined_top5_accuracy))
                    print("Interec MRR: " + str(combined_mrr))
                    logging.info("---------------------------------------------------------------------------")
                    logging.info("                        Top1      Top3      Top5")
                    logging.info("Combined Accuracy       " + str(avg_combined_top1_accuracy) + "      "
                                 + str(avg_combined_top3_accuracy) + "      " + str(avg_combined_top5_accuracy))
                    logging.info("Interec MRR: " + str(combined_mrr))

    return results
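# A small, self-contained sketch restating the weight grid explored by the nested
# loops above: every (alpha, beta, gamma) combination in steps of 0.1 whose
# components are at least 0.1 and sum to 1.0.
weight_grid = [(i / 10, j / 10, k / 10)
               for i in range(1, 9)
               for j in range(1, 9)
               for k in range(1, 9)
               if i + j + k == 10]
print(len(weight_grid))  # 36 combinations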
def test_weight_combination_accuracy_for_all_prs_with_individual_factor_accuracy(self, interec_processor, offset,
                                                                                 limit, main_data_frame):
    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = self.spark.sql(query1)

    file_path__similarity_mrr = 0
    text_similarity_mrr = 0
    activeness_mrr = 0
    file_accuracy_array = [0, 0, 0]
    txt_accuracy_array = [0, 0, 0]
    act_accuracy_array = [0, 0, 0]
    df = pd.DataFrame()
    flag = True

    for i in range(1, 9):
        for j in range(1, 9):
            for k in range(1, 9):
                if i != 0 and j != 0 and k != 0 and i + j + k == 10:
                    total_prs = 0
                    cmb_accuracy_array = [0, 0, 0]
                    combined_mrr = 0

                    for new_pr in all_prs.collect():
                        total_prs += 1
                        new_pr = PullRequest(new_pr)
                        scores_df = main_data_frame.loc[main_data_frame['new_pr_id'] == new_pr.pr_id].copy()
                        ranked_data_frame \
                            = interec_processor.generate_ranked_list(scores_df, i / 10, j / 10, k / 10)

                        file_similarity_rank \
                            = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame, new_pr,
                                                                                'file_path_rank', 2)
                        if file_similarity_rank != 0:
                            file_path__similarity_mrr = file_path__similarity_mrr + (1.0 / file_similarity_rank)

                        text_similarity_rank \
                            = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame, new_pr,
                                                                                'text_rank', 3)
                        if text_similarity_rank != 0:
                            text_similarity_mrr = text_similarity_mrr + (1.0 / text_similarity_rank)

                        activeness_rank \
                            = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame, new_pr,
                                                                                'activeness_rank', 4)
                        if activeness_rank != 0:
                            activeness_mrr = activeness_mrr + (1.0 / activeness_rank)

                        combined_rank \
                            = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame, new_pr,
                                                                                'final_rank', 1)
                        if combined_rank != 0:
                            combined_mrr = combined_mrr + (1.0 / combined_rank)

                        combined_accuracy \
                            = self.__test_combined_accuracy(ranked_data_frame, new_pr, True, True, True)
                        if hasattr(combined_accuracy, 'top1') and combined_accuracy.top1:
                            cmb_accuracy_array[0] += 1
                        if hasattr(combined_accuracy, 'top3') and combined_accuracy.top3:
                            cmb_accuracy_array[1] += 1
                        if hasattr(combined_accuracy, 'top5') and combined_accuracy.top5:
                            cmb_accuracy_array[2] += 1

                        if flag:
                            file_path_accuracy \
                                = self.__test_file_path_similarity_accuracy(ranked_data_frame, new_pr,
                                                                            True, True, True)
                            text_accuracy \
                                = self.__test_text_similarity_accuracy(ranked_data_frame, new_pr, True, True, True)
                            activeness_accuracy \
                                = self.__test_activeness_accuracy(ranked_data_frame, new_pr, True, True, True)

                            if hasattr(file_path_accuracy, 'top1') and file_path_accuracy.top1:
                                file_accuracy_array[0] += 1
                            if hasattr(file_path_accuracy, 'top3') and file_path_accuracy.top3:
                                file_accuracy_array[1] += 1
                            if hasattr(file_path_accuracy, 'top5') and file_path_accuracy.top5:
                                file_accuracy_array[2] += 1

                            if hasattr(text_accuracy, 'top1') and text_accuracy.top1:
                                txt_accuracy_array[0] += 1
                            if hasattr(text_accuracy, 'top3') and text_accuracy.top3:
                                txt_accuracy_array[1] += 1
                            if hasattr(text_accuracy, 'top5') and text_accuracy.top5:
                                txt_accuracy_array[2] += 1

                            if hasattr(activeness_accuracy, 'top1') and activeness_accuracy.top1:
                                act_accuracy_array[0] += 1
                            if hasattr(activeness_accuracy, 'top3') and activeness_accuracy.top3:
                                act_accuracy_array[1] += 1
                            if hasattr(activeness_accuracy, 'top5') and activeness_accuracy.top5:
                                act_accuracy_array[2] += 1

                    combined_mrr = combined_mrr / total_prs
                    file_path__similarity_mrr = file_path__similarity_mrr / total_prs
                    text_similarity_mrr = text_similarity_mrr / total_prs
                    activeness_mrr = activeness_mrr / total_prs

                    avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
                    avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
                    avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs
                    avg_file_path_top1_accuracy = file_accuracy_array[0] / total_prs
                    avg_file_path_top3_accuracy = file_accuracy_array[1] / total_prs
                    avg_file_path_top5_accuracy = file_accuracy_array[2] / total_prs
                    avg_text_top1_accuracy = txt_accuracy_array[0] / total_prs
                    avg_text_top3_accuracy = txt_accuracy_array[1] / total_prs
                    avg_text_top5_accuracy = txt_accuracy_array[2] / total_prs
                    avg_act_top1_accuracy = act_accuracy_array[0] / total_prs
                    avg_act_top3_accuracy = act_accuracy_array[1] / total_prs
                    avg_act_top5_accuracy = act_accuracy_array[2] / total_prs

                    if flag:
                        print("---------------------------------------------------------------------------")
                        print("                        Top1      Top3      Top5")
                        logging.info("---------------------------------------------------------------------------")
                        logging.info("                        Top1      Top3      Top5")
                        print("File Path Accuracy      " + str(avg_file_path_top1_accuracy) + "      "
                              + str(avg_file_path_top3_accuracy) + "      " + str(avg_file_path_top5_accuracy))
                        print("Text Accuracy           " + str(avg_text_top1_accuracy) + "      "
                              + str(avg_text_top3_accuracy) + "      " + str(avg_text_top5_accuracy))
                        print("Activeness Accuracy     " + str(avg_act_top1_accuracy) + "      "
                              + str(avg_act_top3_accuracy) + "      " + str(avg_act_top5_accuracy))
                        print("File Path Similarity MRR: " + str(file_path__similarity_mrr))
                        print("Text Similarity MRR: " + str(text_similarity_mrr))
                        print("Activeness MRR: " + str(activeness_mrr))
                        logging.info("File Path Accuracy      " + str(avg_file_path_top1_accuracy) + "      "
                                     + str(avg_file_path_top3_accuracy) + "      "
                                     + str(avg_file_path_top5_accuracy))
                        logging.info("Text Accuracy           " + str(avg_text_top1_accuracy) + "      "
                                     + str(avg_text_top3_accuracy) + "      " + str(avg_text_top5_accuracy))
                        logging.info("Activeness Accuracy     " + str(avg_act_top1_accuracy) + "      "
                                     + str(avg_act_top3_accuracy) + "      " + str(avg_act_top5_accuracy))
                        logging.info("File Path Similarity MRR: " + str(file_path__similarity_mrr))
                        logging.info("Text Similarity MRR: " + str(text_similarity_mrr))
                        logging.info("Activeness MRR: " + str(activeness_mrr))
                        flag = False

                    print("")
                    print("---------------------------------------------------------------------------")
                    print("alpha= " + str(i / 10) + " beta= " + str(j / 10) + " gamma= " + str(k / 10))
                    print("---------------------------------------------------------------------------")
                    print("                        Top1      Top3      Top5")
                    print("Combined Accuracy       " + str(avg_combined_top1_accuracy) + "      "
                          + str(avg_combined_top3_accuracy) + "      " + str(avg_combined_top5_accuracy))
                    print("Interec MRR: " + str(combined_mrr))
                    logging.info("---------------------------------------------------------------------------")
                    logging.info("                        Top1      Top3      Top5")
                    logging.info("Combined Accuracy       " + str(avg_combined_top1_accuracy) + "      "
                                 + str(avg_combined_top3_accuracy) + "      " + str(avg_combined_top5_accuracy))
                    logging.info("Combined MRR: " + str(combined_mrr))

                    row = {'alpha': str(i / 10), 'beta': str(j / 10), 'gamma': str(k / 10),
                           'Top1': str(avg_combined_top1_accuracy), 'Top3': str(avg_combined_top3_accuracy),
                           'Top5': str(avg_combined_top5_accuracy), 'MRR': str(combined_mrr)}
                    df = df.append(row, ignore_index=True)

    df.to_csv(str(interec_processor.database) + ".csv", index=False)
def __calculate_scores(self, df, new_pr, date_window=120):
    # Calculate scores for each integrator
    for integrator in self.all_integrators:
        pr_integrator = Integrator(integrator[1])

        # Read all the PRs the integrator reviewed before
        if date_window == 0:
            query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                     "integrator_login, files " \
                     "FROM pull_request " \
                     "WHERE merged_date < timestamp('%s') AND integrator_login = '%s'" % \
                     (new_pr.created_date, pr_integrator.integrator_login)
            integrator_reviewed_prs = self.spark.sql(query1).collect()
        else:
            query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                     "integrator_login, files " \
                     "FROM pull_request " \
                     "WHERE merged_date < timestamp('%s') " \
                     "AND merged_date > timestamp('%s') " \
                     "AND integrator_login = '%s'" % \
                     (new_pr.created_date, new_pr.created_date - timedelta(days=date_window),
                      pr_integrator.integrator_login)
            integrator_reviewed_prs = self.spark.sql(query1).collect()

        for integrator_reviewed_pr in integrator_reviewed_prs:
            old_pr = PullRequest(integrator_reviewed_pr)
            old_pr_file_paths = old_pr.files

            # Calculate file path similarity
            for new_pr_file_path in new_pr.files:
                for file_path in old_pr_file_paths:
                    number_of_file_combinations = len(old_pr_file_paths) * len(new_pr.files)
                    max_file_path_length = max(len(new_pr_file_path.split("/")), len(file_path.split("/")))
                    divider = max_file_path_length * number_of_file_combinations

                    pr_integrator.longest_common_prefix_score += \
                        (self.file_path_similarity_calculator.longest_common_prefix_similarity(
                            new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_suffix_score += \
                        (self.file_path_similarity_calculator.longest_common_suffix_similarity(
                            new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_string_score += \
                        (self.file_path_similarity_calculator.longest_common_sub_string_similarity(
                            new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_sequence_score += \
                        (self.file_path_similarity_calculator.longest_common_sub_sequence_similarity(
                            new_pr_file_path, file_path) / divider)

            # Calculate cosine similarity of title
            pr_integrator.pr_title_similarity \
                += self.text_similarity_calculator.cos_similarity(new_pr.title, old_pr.title)

            # Calculate cosine similarity of description
            if new_pr.description != "" and old_pr.description != "":
                pr_integrator.pr_description_similarity \
                    += self.text_similarity_calculator.cos_similarity(new_pr.description, old_pr.description)

            # Calculate activeness of the integrator
            pr_integrator.activeness \
                += self.activeness_calculator.calculate_integrator_activeness(new_pr, old_pr)

        row = {
            'new_pr_id': new_pr.pr_id,
            'new_pr_number': new_pr.pull_number,
            'integrator': pr_integrator.integrator_login,
            'lcp': pr_integrator.longest_common_prefix_score,
            'lcs': pr_integrator.longest_common_suffix_score,
            'lc_substr': pr_integrator.longest_common_sub_string_score,
            'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
            'cos_title': pr_integrator.pr_title_similarity,
            'cos_description': pr_integrator.pr_description_similarity,
            'activeness': pr_integrator.activeness,
            'file_similarity': pr_integrator.longest_common_prefix_score
                               + pr_integrator.longest_common_suffix_score
                               + pr_integrator.longest_common_sub_string_score
                               + pr_integrator.longest_common_sub_sequence_score,
            'text_similarity': pr_integrator.pr_title_similarity + pr_integrator.pr_description_similarity
        }
        df = df.append(row, ignore_index=True)

    return df
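# A minimal, self-contained sketch (hypothetical file paths and a simplified
# prefix counter, not the project's FilePathSimilarityCalculator) of the
# normalisation used in __calculate_scores above: every file-pair score is
# divided by max_file_path_length * number_of_file_combinations, so the score
# accumulated per old PR stays bounded no matter how many files the two PRs touch.
def shared_leading_components(path_a, path_b):
    # Count how many leading path components the two paths share.
    count = 0
    for x, y in zip(path_a.split("/"), path_b.split("/")):
        if x != y:
            break
        count += 1
    return count


new_files = ["app/models/user.py", "app/views/user_view.py"]   # hypothetical new PR
old_files = ["app/models/account.py"]                          # hypothetical old PR
combinations = len(new_files) * len(old_files)

score = 0.0
for nf in new_files:
    for of in old_files:
        divider = max(len(nf.split("/")), len(of.split("/"))) * combinations
        score += shared_leading_components(nf, of) / divider

print(score)  # 0.5 here; this measure never exceeds 1.0 for one old PR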
def calculate_scores(database, new_pr):
    pr_integrator = Integrator(new_pr.integrator_login)

    # Connection to MySQL database
    connection = pymysql.connect(host='localhost', port=3306, user='******', passwd='', db=database)
    try:
        with connection.cursor() as cursor:
            # Read records
            query2 = "SELECT * FROM pull_request WHERE merged_date <%s AND integrator_login =%s"
            inputs = (new_pr.created_date.strftime('%Y-%m-%d %H:%M:%S'), new_pr.integrator_login)
            cursor.execute(query2, inputs)
            integrator_reviewed_prs = cursor.fetchall()
    finally:
        connection.close()

    for integrator_reviewed_pr in integrator_reviewed_prs:
        old_pr = PullRequest(integrator_reviewed_pr)
        old_pr_file_paths = old_pr.files

        # Calculate file path similarity
        for new_pr_file_path in new_pr.files:
            for file_path in old_pr_file_paths:
                max_file_path_length = max(len(new_pr_file_path.split("/")), len(file_path.split("/")))
                pr_integrator.longest_common_prefix_score += \
                    (longest_common_prefix(new_pr_file_path, file_path) / max_file_path_length)
                pr_integrator.longest_common_suffix_score += \
                    (longest_common_suffix(new_pr_file_path, file_path) / max_file_path_length)
                pr_integrator.longest_common_sub_string_score += \
                    (longest_common_sub_string(new_pr_file_path, file_path) / max_file_path_length)
                pr_integrator.longest_common_sub_sequence_score += \
                    (longest_common_sub_sequence(new_pr_file_path, file_path) / max_file_path_length)

        # Calculate cosine similarity of title
        pr_integrator.pr_title_similarity += cos_similarity(new_pr.title, old_pr.title)

        # Calculate cosine similarity of description
        if new_pr.description != "" and old_pr.description != "":
            pr_integrator.pr_description_similarity += cos_similarity(new_pr.description, old_pr.description)

        # Calculate activeness of the integrator
        pr_integrator.activeness = calculate_integrator_activeness(new_pr, old_pr)

        # Calculate number of first pulls merged, total number of PRs and total commits
        if old_pr.first_pull == 1:
            pr_integrator.num_of_first_pulls += 1
        pr_integrator.num_of_prs += 1
        pr_integrator.total_commits += old_pr.num_of_commits

    # Calculate first pull similarity and average commits
    if pr_integrator.num_of_prs == 0:
        first_pull_similarity = 0
        average_commits = 0
    else:
        first_pull_similarity = pr_integrator.num_of_first_pulls / pr_integrator.num_of_prs
        average_commits = pr_integrator.total_commits / pr_integrator.num_of_prs

    row = {
        'lcp': pr_integrator.longest_common_prefix_score,
        'lcs': pr_integrator.longest_common_suffix_score,
        'lc_substr': pr_integrator.longest_common_sub_string_score,
        'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
        'cos_title': pr_integrator.pr_title_similarity,
        'cos_description': pr_integrator.pr_description_similarity,
        'activeness': pr_integrator.activeness,
        'first_pull': first_pull_similarity,
        'avg_commits': average_commits
    }
    return row
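# Worked mini example (hypothetical counts) of the two ratios computed at the end
# of calculate_scores above:
num_of_first_pulls = 3     # old PRs that were the contributor's first pull
num_of_prs = 12            # all old PRs this integrator merged
total_commits = 30         # commits across those PRs

first_pull_similarity = num_of_first_pulls / num_of_prs    # 0.25
average_commits = total_commits / num_of_prs               # 2.5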
# time_decaying_parameter
const_lambda = -1

tfidf_vectorizer = TfidfVectorizer(analyzer=text_process)


def cos_similarity(title1, title2):
    term_frequency = tfidf_vectorizer.fit_transform([title1, title2])
    return (term_frequency * term_frequency.T).A[0, 1]


df1 = pd.DataFrame()
for test_pr in test_prs:
    test_pr = PullRequest(test_pr)
    print(test_pr.pr_id)

    for integrator in integrators:
        pr_integrator = Integrator(integrator[0])

        # Connection to MySQL database
        connection = pymysql.connect(host='localhost', port=3306, user='******', passwd='', db='rails')
        try:
            with connection.cursor() as cursor:
                # Read records
                query2 = "SELECT * FROM pull_request WHERE merged_date <%s AND integrator_login =%s"
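# A minimal sketch of how cos_similarity above behaves, using two hypothetical PR
# titles and scikit-learn's default analyzer instead of the project's text_process
# helper. TfidfVectorizer L2-normalises its rows, so (tf * tf.T) is the pairwise
# cosine-similarity matrix and the [0, 1] entry is the similarity of the two titles.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tf = vectorizer.fit_transform(["Fix memory leak in scheduler",
                               "Fix scheduler memory leak on shutdown"])
similarity = (tf * tf.T).toarray()[0, 1]
print(similarity)  # value in [0, 1]; higher means more similar titles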
def test_accuracy_for_all_prs(main_data_csv_file_name, offset, limit):
    logging.basicConfig(level=logging.INFO, filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')
    main_df = pd.read_csv(main_data_csv_file_name)

    # TODO: standardise data
    act_min = main_df['activeness'].min()
    act_max = main_df['activeness'].max()
    file_sim_min = main_df['file_similarity'].min()
    file_sim_max = main_df['file_similarity'].max()
    txt_sim_min = main_df['text_similarity'].min()
    txt_sim_max = main_df['text_similarity'].max()

    main_df['std_activeness'] = \
        main_df['activeness'].apply(standardize_score, args=(act_min, act_max))
    main_df['std_file_similarity'] = \
        main_df['file_similarity'].apply(standardize_score, args=(file_sim_min, file_sim_max))
    main_df['std_text_similarity'] = \
        main_df['text_similarity'].apply(standardize_score, args=(txt_sim_min, txt_sim_max))

    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = spark.sql(query1)
    flag = True

    for i in range(1, 9):
        for j in range(1, 9):
            for k in range(1, 9):
                if i != 0 and j != 0 and k != 0 and i + j + k == 10:
                    total_prs = 0
                    cmb_accuracy_array = [0, 0, 0]
                    file_accuracy_array = [0, 0, 0]
                    txt_accuracy_array = [0, 0, 0]
                    act_accuracy_array = [0, 0, 0]

                    print("")
                    print("---------------------------------------------------------------------------")
                    print("alpha= " + str(i / 10) + " beta= " + str(j / 10) + " gamma= " + str(k / 10))

                    for new_pr in all_prs.collect():
                        total_prs += 1
                        new_pr = PullRequest(new_pr)
                        scores_df = main_df.loc[main_df['new_pr_id'] == new_pr.pr_id].copy()
                        # print(new_pr.pr_id)
                        ranked_data_frame = generate_ranked_list(scores_df, i / 10, j / 10, k / 10)

                        combined_accuracy = test_combined_accuracy(ranked_data_frame, new_pr, True, True, True)
                        file_path_accuracy = test_file_path_similarity_accuracy(ranked_data_frame, new_pr,
                                                                                True, True, True)
                        text_accuracy = test_text_similarity_accuracy(ranked_data_frame, new_pr, True, True, True)
                        activeness_accuracy = test_activeness_accuracy(ranked_data_frame, new_pr, True, True, True)

                        if hasattr(combined_accuracy, 'top1') and combined_accuracy.top1:
                            cmb_accuracy_array[0] += 1
                        if hasattr(combined_accuracy, 'top3') and combined_accuracy.top3:
                            cmb_accuracy_array[1] += 1
                        if hasattr(combined_accuracy, 'top5') and combined_accuracy.top5:
                            cmb_accuracy_array[2] += 1

                        if hasattr(file_path_accuracy, 'top1') and file_path_accuracy.top1:
                            file_accuracy_array[0] += 1
                        if hasattr(file_path_accuracy, 'top3') and file_path_accuracy.top3:
                            file_accuracy_array[1] += 1
                        if hasattr(file_path_accuracy, 'top5') and file_path_accuracy.top5:
                            file_accuracy_array[2] += 1

                        if hasattr(text_accuracy, 'top1') and text_accuracy.top1:
                            txt_accuracy_array[0] += 1
                        if hasattr(text_accuracy, 'top3') and text_accuracy.top3:
                            txt_accuracy_array[1] += 1
                        if hasattr(text_accuracy, 'top5') and text_accuracy.top5:
                            txt_accuracy_array[2] += 1

                        if hasattr(activeness_accuracy, 'top1') and activeness_accuracy.top1:
                            act_accuracy_array[0] += 1
                        if hasattr(activeness_accuracy, 'top3') and activeness_accuracy.top3:
                            act_accuracy_array[1] += 1
                        if hasattr(activeness_accuracy, 'top5') and activeness_accuracy.top5:
                            act_accuracy_array[2] += 1

                    avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
                    avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
                    avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs
                    avg_file_path_top1_accuracy = file_accuracy_array[0] / total_prs
                    avg_file_path_top3_accuracy = file_accuracy_array[1] / total_prs
                    avg_file_path_top5_accuracy = file_accuracy_array[2] / total_prs
                    avg_text_top1_accuracy = txt_accuracy_array[0] / total_prs
                    avg_text_top3_accuracy = txt_accuracy_array[1] / total_prs
                    avg_text_top5_accuracy = txt_accuracy_array[2] / total_prs
                    avg_act_top1_accuracy = act_accuracy_array[0] / total_prs
                    avg_act_top3_accuracy = act_accuracy_array[1] / total_prs
                    avg_act_top5_accuracy = act_accuracy_array[2] / total_prs

                    print("---------------------------------------------------------------------------")
                    print("                        Top1      Top3      Top5")
                    print("Combined Accuracy       " + str(avg_combined_top1_accuracy) + "      "
                          + str(avg_combined_top3_accuracy) + "      " + str(avg_combined_top5_accuracy))

                    if flag:
                        print("File Path Accuracy      " + str(avg_file_path_top1_accuracy) + "      "
                              + str(avg_file_path_top3_accuracy) + "      " + str(avg_file_path_top5_accuracy))
                        print("Text Accuracy           " + str(avg_text_top1_accuracy) + "      "
                              + str(avg_text_top3_accuracy) + "      " + str(avg_text_top5_accuracy))
                        print("Activeness Accuracy     " + str(avg_act_top1_accuracy) + "      "
                              + str(avg_act_top3_accuracy) + "      " + str(avg_act_top5_accuracy))
                        flag = False