Example #1
def calculate_scores_for_prs(database, starting_pr_number, limit):
    # TODO: Add comments for all the scripts
    logging.basicConfig(level=logging.INFO,
                        filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')
    df1 = pd.DataFrame()

    # Connection to MySQL database
    connection = pymysql.connect(host='localhost',
                                 port=3306,
                                 user='******',
                                 passwd='',
                                 db=database)

    try:
        with connection.cursor() as cursor:
            # Read records
            query1 = "SELECT * FROM pull_request LIMIT %s OFFSET %s"
            inputs = (limit, starting_pr_number)
            cursor.execute(query1, inputs)
            all_prs = cursor.fetchall()
    finally:
        connection.close()

    for new_pr in all_prs:
        new_pr = PullRequest(new_pr)
        row = calculate_scores(database, new_pr)
        df1 = df1.append(row, ignore_index=True)
        logging.info(new_pr.pr_id)
        print(new_pr.pr_id)

    df1.to_csv('pr_stats.csv', index=False)
    print(df1)
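A minimal invocation sketch for the function above; it assumes the module-level imports (logging, pandas as pd, pymysql) and the PullRequest and calculate_scores helpers used by the other examples are in scope, and the 'rails' database name is only an illustrative value:

# Hypothetical usage: score the first 100 pull requests of the 'rails'
# database and write the per-PR factor scores to pr_stats.csv.
calculate_scores_for_prs('rails', starting_pr_number=0, limit=100)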
Example #2
    def get_related_integrators_for_pr_by_pr_number(self, pr_number):
        """
        This function calculates scores for each factor for each integrator and provides a ranked data frame
        that includes the top five integrators.

        :EXAMPLE:

        >>> interec.get_related_integrators_for_pr_by_pr_number(10)

        :param pr_number: PR id number
        :type pr_number: int
        :return: Top five integrators data frame
        :rtype: DataFrame
        """
        logging.info("Getting related integrators by PR number for PR" +
                     str(pr_number) + " started")
        pr_details = self.get_pr_details(pr_number)
        new_pr = PullRequest(pr_details)
        df = pd.DataFrame()
        df = self.__calculate_scores(df, new_pr, self.date_window)
        ranked_df = self.generate_ranked_list(df, self.alpha, self.beta,
                                              self.gamma)
        sorted_ranked_data_frame = ranked_df.sort_values('final_rank',
                                                         ascending=True)
        ranked_five_df = sorted_ranked_data_frame[
            sorted_ranked_data_frame['final_rank'] <= 5]
        logging.info("Top five integrators for PR " + str(pr_number) +
                     " presented")
        return ranked_five_df
Example #3
    def get_related_integrators_for_pr(self, pr_number, requester_login, title,
                                       description, created_date_time, files):
        """
        This function calculates scores for each factor for each integrator and provides a ranked data frame
        that includes the top five integrators.

        :EXAMPLE:

        >>> interec.get_related_integrators_for_pr(10, 'John', 'PR Title', 'PR Description', '2019-03-10 17:52:31', 'abc.js|def.js|ghi.js')

        :param pr_number: PR id number
        :type pr_number: int
        :param requester_login: Contributor username
        :type requester_login: String
        :param title: Title of the PR
        :type title: String
        :param description: Description of the PR
        :type description: String
        :param created_date_time: PR created date and time
        :type created_date_time: String
        :param files: File paths of the PR
        :type files: String
        :return: Top five integrators data frame
        :rtype: DataFrame
        """
        logging.info("Getting related integrators by PR details for PR " +
                     str(pr_number) + " started")
        created_date_time = datetime.strptime(created_date_time,
                                              '%Y-%m-%d %H:%M:%S')
        pr_data = [
            0, pr_number, requester_login, title, description,
            created_date_time, 0, " ", files
        ]
        new_pr = PullRequest(pr_data)
        df = pd.DataFrame()
        df = self.__calculate_scores(df, new_pr, self.date_window)
        ranked_df = self.generate_ranked_list(df, self.alpha, self.beta,
                                              self.gamma)
        sorted_ranked_data_frame = ranked_df.sort_values('final_rank',
                                                         ascending=True)
        ranked_five_df = sorted_ranked_data_frame[
            sorted_ranked_data_frame['final_rank'] <= 5]
        logging.info("Top five integrators for PR " + str(pr_number) +
                     " presented")
        return ranked_five_df
Example #4
def get_recommendation(column_name, required_integrators):  # new_pr,
    logging.basicConfig(level=logging.INFO,
                        filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')

    query_demo = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE pr_id = '2001'"
    pr_demo = spark.sql(query_demo)
    new_pr = pr_demo.collect()[0]
    new_pr = PullRequest(new_pr)

    print(new_pr.pr_id)
    ranked_data_frame = generate_ranked_list(new_pr)
    sorted_ranked_data_frame = ranked_data_frame.sort_values(column_name,
                                                             ascending=True)
    recommended_integrators = sorted_ranked_data_frame[
        sorted_ranked_data_frame[column_name] <= required_integrators]
    print("Position             Integrator")
    print("-------------------------------")
    for row in recommended_integrators.itertuples(index=False):
        print("  " + str(row.final_rank) + "                " + row.integrator)
Example #5
    def __calculate_scores_for_all_prs(self, offset, limit, date_window=120):
        query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE pr_id > '%s' and pr_id <= '%s' " \
                 "ORDER BY pr_id " \
                 "LIMIT %d" % (offset, offset + limit, limit)
        all_prs = self.spark.sql(query1)

        total_prs = 0
        df = pd.DataFrame()

        for new_pr in all_prs.collect():
            total_prs += 1
            new_pr = PullRequest(new_pr)
            df = self.__calculate_scores(df, new_pr, date_window)
            print("Scores calculated for: " + str(date_window) + "_" +
                  str(new_pr.pr_id))
            logging.info("Scores calculated for: " + str(date_window) + "_" +
                         str(new_pr.pr_id))
        df.to_csv(str(date_window) + "_" + self.database +
                  "_all_integrator_scores_for_each_test_pr.csv",
                  index=False)
        return df
Example #6
def calculate_scores(new_pr):
    df1 = pd.DataFrame()

    # Calculate scores for each integrator
    for integrator in all_integrators:
        pr_integrator = Integrator(integrator[1])

        # Read all the PRs integrator reviewed before
        query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE merged_date < timestamp('%s') AND integrator_login = '******'" % \
                 (new_pr.created_date, pr_integrator.integrator_login)
        integrator_reviewed_prs = spark.sql(query1).collect()
        print(len(integrator_reviewed_prs))  # TODO: Remove this

        for integrator_reviewed_pr in integrator_reviewed_prs:
            old_pr = PullRequest(integrator_reviewed_pr)
            old_pr_file_paths = old_pr.files

            # Calculate file path similarity
            for new_pr_file_path in new_pr.files:
                for file_path in old_pr_file_paths:
                    number_of_file_combinations = len(old_pr_file_paths) * len(
                        new_pr.files)
                    max_file_path_length = max(
                        len(new_pr_file_path.split("/")),
                        len(file_path.split("/")))
                    divider = max_file_path_length * number_of_file_combinations

                    pr_integrator.longest_common_prefix_score += \
                        (longest_common_prefix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_suffix_score += \
                        (longest_common_suffix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_string_score += \
                        (longest_common_sub_string(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_sequence_score += \
                        (longest_common_sub_sequence(new_pr_file_path, file_path) / divider)

            # Calculate cosine similarity of title
            pr_integrator.pr_title_similarity += cos_similarity(
                new_pr.title, old_pr.title)

            # Calculate cosine similarity of description
            if new_pr.description != "" and old_pr.description != "":
                pr_integrator.pr_description_similarity += cos_similarity(
                    new_pr.description, old_pr.description)

            # Calculate activeness of the integrator
            pr_integrator.activeness += calculate_integrator_activeness(
                new_pr, old_pr)

        row = {
            'integrator': pr_integrator.integrator_login,
            'lcp': pr_integrator.longest_common_prefix_score,
            'lcs': pr_integrator.longest_common_suffix_score,
            'lc_substr': pr_integrator.longest_common_sub_string_score,
            'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
            'cos_title': pr_integrator.pr_title_similarity,
            'cos_description': pr_integrator.pr_description_similarity,
            'activeness': pr_integrator.activeness
        }
        df1 = df1.append(row, ignore_index=True)
    return df1
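The longest_common_prefix helper used above is not shown in these examples. A plausible sketch, under the assumption that it counts matching leading path components (consistent with the normalisation by the number of '/'-separated segments in max_file_path_length), is:

def longest_common_prefix(path_a, path_b):
    # Assumed behaviour, not the project's confirmed helper: count the
    # leading '/'-separated segments shared by the two file paths.
    segments_a = path_a.split("/")
    segments_b = path_b.split("/")
    common = 0
    for seg_a, seg_b in zip(segments_a, segments_b):
        if seg_a != seg_b:
            break
        common += 1
    return common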
Example #7
def test_accuracy_for_all_prs(offset, limit):
    logging.basicConfig(level=logging.INFO,
                        filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')

    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = spark.sql(query1)

    total_prs = 0
    cmb_accuracy_array = [0, 0, 0]
    file_accuracy_array = [0, 0, 0]
    txt_accuracy_array = [0, 0, 0]
    act_accuracy_array = [0, 0, 0]

    for new_pr in all_prs.collect():
        total_prs += 1
        new_pr = PullRequest(new_pr)
        print(new_pr.pr_id)  # TODO: Remove this
        ranked_data_frame = generate_ranked_list(new_pr)
        combined_accuracy = test_combined_accuracy(ranked_data_frame, new_pr,
                                                   True, True, True)
        file_path_accuracy = test_file_path_similarity_accuracy(
            ranked_data_frame, new_pr, True, True, True)
        text_accuracy = test_text_similarity_accuracy(ranked_data_frame,
                                                      new_pr, True, True, True)
        activeness_accuracy = test_activeness_accuracy(ranked_data_frame,
                                                       new_pr, True, True,
                                                       True)

        if hasattr(combined_accuracy, 'top1') and combined_accuracy.top1:
            cmb_accuracy_array[0] += 1
        if hasattr(combined_accuracy, 'top3') and combined_accuracy.top3:
            cmb_accuracy_array[1] += 1
        if hasattr(combined_accuracy, 'top5') and combined_accuracy.top5:
            cmb_accuracy_array[2] += 1

        if hasattr(file_path_accuracy, 'top1') and file_path_accuracy.top1:
            file_accuracy_array[0] += 1
        if hasattr(file_path_accuracy, 'top3') and file_path_accuracy.top3:
            file_accuracy_array[1] += 1
        if hasattr(file_path_accuracy, 'top5') and file_path_accuracy.top5:
            file_accuracy_array[2] += 1

        if hasattr(text_accuracy, 'top1') and text_accuracy.top1:
            txt_accuracy_array[0] += 1
        if hasattr(text_accuracy, 'top3') and text_accuracy.top3:
            txt_accuracy_array[1] += 1
        if hasattr(text_accuracy, 'top5') and text_accuracy.top5:
            txt_accuracy_array[2] += 1

        if hasattr(activeness_accuracy, 'top1') and activeness_accuracy.top1:
            act_accuracy_array[0] += 1
        if hasattr(activeness_accuracy, 'top3') and activeness_accuracy.top3:
            act_accuracy_array[1] += 1
        if hasattr(activeness_accuracy, 'top5') and activeness_accuracy.top5:
            act_accuracy_array[2] += 1

    avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
    avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
    avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs

    avg_file_path_top1_accuracy = file_accuracy_array[0] / total_prs
    avg_file_path_top3_accuracy = file_accuracy_array[1] / total_prs
    avg_file_path_top5_accuracy = file_accuracy_array[2] / total_prs

    avg_text_top1_accuracy = txt_accuracy_array[0] / total_prs
    avg_text_top3_accuracy = txt_accuracy_array[1] / total_prs
    avg_text_top5_accuracy = txt_accuracy_array[2] / total_prs

    avg_act_top1_accuracy = act_accuracy_array[0] / total_prs
    avg_act_top3_accuracy = act_accuracy_array[1] / total_prs
    avg_act_top5_accuracy = act_accuracy_array[2] / total_prs

    print(
        "---------------------------------------------------------------------------"
    )
    print("                         Top1          Top3            Top5")
    print("Combined Accuracy         " + str(avg_combined_top1_accuracy) +
          "          " + str(avg_combined_top3_accuracy) + "         " +
          str(avg_combined_top5_accuracy))
    print("File Path Accuracy        " + str(avg_file_path_top1_accuracy) +
          "          " + str(avg_file_path_top3_accuracy) + "         " +
          str(avg_file_path_top5_accuracy))
    print("Text Accuracy             " + str(avg_text_top1_accuracy) +
          "          " + str(avg_text_top3_accuracy) + "         " +
          str(avg_text_top5_accuracy))
    print("Activeness Accuracy       " + str(avg_act_top1_accuracy) +
          "          " + str(avg_act_top3_accuracy) + "         " +
          str(avg_act_top5_accuracy))
Example #8
def calculate_scores(offset, limit):
    df = pd.DataFrame()

    logging.basicConfig(level=logging.INFO,
                        filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')

    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = spark.sql(query1)

    for test_pr in all_prs.collect():
        test_pr = PullRequest(test_pr)
        print(test_pr.pr_id)
        logging.info(test_pr.pr_id)
        pr_integrator = Integrator(test_pr.integrator_login)
        # Calculate scores for integrator

        # Read all the PRs integrator reviewed before
        query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE merged_date < timestamp('%s') AND integrator_login = '******'" % \
                 (test_pr.created_date, pr_integrator.integrator_login)
        integrator_reviewed_prs = spark.sql(query1).collect()

        for integrator_reviewed_pr in integrator_reviewed_prs:
            old_pr = PullRequest(integrator_reviewed_pr)
            old_pr_file_paths = old_pr.files

            # Calculate file path similarity
            for new_pr_file_path in test_pr.files:
                for file_path in old_pr_file_paths:
                    number_of_file_combinations = len(old_pr_file_paths) * len(
                        test_pr.files)
                    max_file_path_length = max(
                        len(new_pr_file_path.split("/")),
                        len(file_path.split("/")))
                    divider = max_file_path_length * number_of_file_combinations

                    pr_integrator.longest_common_prefix_score += \
                        (longest_common_prefix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_suffix_score += \
                        (longest_common_suffix(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_string_score += \
                        (longest_common_sub_string(new_pr_file_path, file_path) / divider)
                    pr_integrator.longest_common_sub_sequence_score += \
                        (longest_common_sub_sequence(new_pr_file_path, file_path) / divider)

            # Calculate cosine similarity of title
            pr_integrator.pr_title_similarity += cos_similarity(
                test_pr.title, old_pr.title)

            # Calculate cosine similarity of description
            if test_pr.description != "" and old_pr.description != "":
                pr_integrator.pr_description_similarity += cos_similarity(
                    test_pr.description, old_pr.description)

            # Calculate activeness of the integrator
            pr_integrator.activeness += calculate_integrator_activeness(
                test_pr, old_pr)

        row = {
            'pr_id': test_pr.pr_id,
            'integrator': pr_integrator.integrator_login,
            'lcp': pr_integrator.longest_common_prefix_score,
            'lcs': pr_integrator.longest_common_suffix_score,
            'lc_substr': pr_integrator.longest_common_sub_string_score,
            'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
            'cos_title': pr_integrator.pr_title_similarity,
            'cos_description': pr_integrator.pr_description_similarity,
            'activeness': pr_integrator.activeness,
            'text_similarity': pr_integrator.pr_title_similarity +
                               pr_integrator.pr_description_similarity,
            'file_similarity': (pr_integrator.longest_common_prefix_score +
                                pr_integrator.longest_common_suffix_score +
                                pr_integrator.longest_common_sub_string_score +
                                pr_integrator.longest_common_sub_sequence_score)
        }
        df = df.append(row, ignore_index=True)
    csv_file_name = database + "_test_pr_stats.csv"
    df.to_csv(csv_file_name, index=False)
Example #9
    def test_weight_combination_accuracy_for_all_prs(self, interec_processor, offset, limit, main_data_frame):
        query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE pr_id > '%s' and pr_id <= '%s' " \
                 "ORDER BY pr_id " \
                 "LIMIT %d" % (offset, offset + limit, limit)
        all_prs = self.spark.sql(query1)

        results = []
        for i in range(1, 9):
            for j in range(1, 9):
                for k in range(1, 9):
                    if i != 0 and j != 0 and k != 0 and i + j + k == 10:
                        total_prs = 0
                        cmb_accuracy_array = [0, 0, 0]
                        combined_mrr = 0

                        print("")
                        print("---------------------------------------------------------------------------")
                        print("alpha= " + str(i / 10) + " beta= " + str(j / 10) + " gamma= " + str(k / 10))
                        logging.info("")
                        logging.info("---------------------------------------------------------------------------")
                        logging.info("alpha= " + str(i / 10) + " beta= " + str(j / 10) + " gamma= " + str(k / 10))

                        for new_pr in all_prs.collect():
                            total_prs += 1
                            new_pr = PullRequest(new_pr)

                            scores_df = main_data_frame.loc[main_data_frame['new_pr_id'] == new_pr.pr_id].copy()

                            ranked_data_frame \
                                = interec_processor.generate_ranked_list(scores_df, i / 10, j / 10, k / 10)

                            combined_rank = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame,
                                                                                              new_pr, 'final_rank', 1)
                            if combined_rank != 0:
                                combined_mrr = combined_mrr + (1.0 / combined_rank)

                            combined_accuracy \
                                = self.__test_combined_accuracy(ranked_data_frame, new_pr, True, True, True)

                            if hasattr(combined_accuracy, 'top1') and combined_accuracy.top1:
                                cmb_accuracy_array[0] += 1
                            if hasattr(combined_accuracy, 'top3') and combined_accuracy.top3:
                                cmb_accuracy_array[1] += 1
                            if hasattr(combined_accuracy, 'top5') and combined_accuracy.top5:
                                cmb_accuracy_array[2] += 1

                        combined_mrr = combined_mrr / total_prs

                        avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
                        avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
                        avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs

                        combination_result = {
                            'alpha': (i / 10),
                            'beta': (j / 10),
                            'gamma': (k / 10),
                            'top1': avg_combined_top1_accuracy,
                            'top3': avg_combined_top3_accuracy,
                            'top5': avg_combined_top5_accuracy,
                            'mrr': combined_mrr
                        }

                        results.append(combination_result)

                        print("---------------------------------------------------------------------------")
                        print("                         Top1          Top3            Top5")
                        print("Combined Accuracy         " + str(avg_combined_top1_accuracy) + "          " +
                              str(avg_combined_top3_accuracy) + "         " + str(avg_combined_top5_accuracy))
                        print("Interec MRR: " + str(combined_mrr))
                        logging.info("---------------------------------------------------------------------------")
                        logging.info("                         Top1          Top3            Top5")
                        logging.info("Combined Accuracy         " + str(avg_combined_top1_accuracy) + "          " +
                                     str(avg_combined_top3_accuracy) + "         " + str(avg_combined_top5_accuracy))
                        logging.info("Interec MRR: " + str(combined_mrr))
        return results
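The three nested loops above enumerate every (alpha, beta, gamma) weight triple, in steps of 0.1, that sums to 1.0. A standalone sketch of the same enumeration makes the search space explicit:

# Mirrors the i + j + k == 10 filter used above; yields 36 weight triples.
weight_triples = [(i / 10, j / 10, k / 10)
                  for i in range(1, 9)
                  for j in range(1, 9)
                  for k in range(1, 9)
                  if i + j + k == 10]
print(len(weight_triples))  # 36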
Example #10
    def test_weight_combination_accuracy_for_all_prs_with_individual_factor_accuracy(self, interec_processor, offset,
                                                                                     limit, main_data_frame):
        query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                 "integrator_login, files " \
                 "FROM pull_request " \
                 "WHERE pr_id > '%s' and pr_id <= '%s' " \
                 "ORDER BY pr_id " \
                 "LIMIT %d" % (offset, offset + limit, limit)
        all_prs = self.spark.sql(query1)

        file_path__similarity_mrr = 0
        text_similarity_mrr = 0
        activeness_mrr = 0

        file_accuracy_array = [0, 0, 0]
        txt_accuracy_array = [0, 0, 0]
        act_accuracy_array = [0, 0, 0]
        df = pd.DataFrame()
        flag = True
        for i in range(1, 9):
            for j in range(1, 9):
                for k in range(1, 9):
                    if i != 0 and j != 0 and k != 0 and i + j + k == 10:
                        total_prs = 0
                        cmb_accuracy_array = [0, 0, 0]
                        combined_mrr = 0

                        for new_pr in all_prs.collect():
                            total_prs += 1
                            new_pr = PullRequest(new_pr)

                            scores_df = main_data_frame.loc[main_data_frame['new_pr_id'] == new_pr.pr_id].copy()

                            ranked_data_frame \
                                = interec_processor.generate_ranked_list(scores_df, i / 10, j / 10, k / 10)

                            file_similarity_rank = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame,
                                                                                                     new_pr,
                                                                                                     'file_path_rank',
                                                                                                     2)
                            if file_similarity_rank != 0:
                                file_path__similarity_mrr = file_path__similarity_mrr + (1.0/file_similarity_rank)

                            text_similarity_rank = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame,
                                                                                                     new_pr,
                                                                                                     'text_rank', 3)
                            if text_similarity_rank != 0:
                                text_similarity_mrr = text_similarity_mrr + (1.0/text_similarity_rank)

                            activeness_rank = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame,
                                                                                                new_pr,
                                                                                                'activeness_rank', 4)
                            if activeness_rank != 0:
                                activeness_mrr = activeness_mrr + (1.0/activeness_rank)

                            combined_rank = self.__get_actual_rank_place_of_actual_integrator(ranked_data_frame,
                                                                                              new_pr, 'final_rank', 1)
                            if combined_rank != 0:
                                combined_mrr = combined_mrr + (1.0 / combined_rank)

                            combined_accuracy \
                                = self.__test_combined_accuracy(ranked_data_frame, new_pr, True, True, True)

                            if hasattr(combined_accuracy, 'top1') and combined_accuracy.top1:
                                cmb_accuracy_array[0] += 1
                            if hasattr(combined_accuracy, 'top3') and combined_accuracy.top3:
                                cmb_accuracy_array[1] += 1
                            if hasattr(combined_accuracy, 'top5') and combined_accuracy.top5:
                                cmb_accuracy_array[2] += 1

                            if flag:
                                file_path_accuracy \
                                    = self.__test_file_path_similarity_accuracy(ranked_data_frame, new_pr, True, True, True)
                                text_accuracy \
                                    = self.__test_text_similarity_accuracy(ranked_data_frame, new_pr, True, True, True)
                                activeness_accuracy \
                                    = self.__test_activeness_accuracy(ranked_data_frame, new_pr, True, True, True)

                                if hasattr(file_path_accuracy, 'top1') and file_path_accuracy.top1:
                                    file_accuracy_array[0] += 1
                                if hasattr(file_path_accuracy, 'top3') and file_path_accuracy.top3:
                                    file_accuracy_array[1] += 1
                                if hasattr(file_path_accuracy, 'top5') and file_path_accuracy.top5:
                                    file_accuracy_array[2] += 1

                                if hasattr(text_accuracy, 'top1') and text_accuracy.top1:
                                    txt_accuracy_array[0] += 1
                                if hasattr(text_accuracy, 'top3') and text_accuracy.top3:
                                    txt_accuracy_array[1] += 1
                                if hasattr(text_accuracy, 'top5') and text_accuracy.top5:
                                    txt_accuracy_array[2] += 1

                                if hasattr(activeness_accuracy, 'top1') and activeness_accuracy.top1:
                                    act_accuracy_array[0] += 1
                                if hasattr(activeness_accuracy, 'top3') and activeness_accuracy.top3:
                                    act_accuracy_array[1] += 1
                                if hasattr(activeness_accuracy, 'top5') and activeness_accuracy.top5:
                                    act_accuracy_array[2] += 1

                        combined_mrr = combined_mrr/total_prs
                        file_path__similarity_mrr = file_path__similarity_mrr/total_prs
                        text_similarity_mrr = text_similarity_mrr/total_prs
                        activeness_mrr = activeness_mrr/total_prs

                        avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
                        avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
                        avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs

                        avg_file_path_top1_accuracy = file_accuracy_array[0] / total_prs
                        avg_file_path_top3_accuracy = file_accuracy_array[1] / total_prs
                        avg_file_path_top5_accuracy = file_accuracy_array[2] / total_prs

                        avg_text_top1_accuracy = txt_accuracy_array[0] / total_prs
                        avg_text_top3_accuracy = txt_accuracy_array[1] / total_prs
                        avg_text_top5_accuracy = txt_accuracy_array[2] / total_prs

                        avg_act_top1_accuracy = act_accuracy_array[0] / total_prs
                        avg_act_top3_accuracy = act_accuracy_array[1] / total_prs
                        avg_act_top5_accuracy = act_accuracy_array[2] / total_prs

                        if flag:
                            print("---------------------------------------------------------------------------")
                            print("                         Top1          Top3            Top5")
                            logging.info("---------------------------------------------------------------------------")
                            logging.info("                         Top1          Top3            Top5")
                            print("File Path Accuracy        " + str(avg_file_path_top1_accuracy) + "          " +
                                  str(avg_file_path_top3_accuracy) + "         " + str(avg_file_path_top5_accuracy))
                            print("Text Accuracy             " + str(avg_text_top1_accuracy) + "          " +
                                  str(avg_text_top3_accuracy) + "         " + str(avg_text_top5_accuracy))
                            print("Activeness Accuracy       " + str(avg_act_top1_accuracy) + "          " +
                                  str(avg_act_top3_accuracy) + "         " + str(avg_act_top5_accuracy))
                            print("File Path Similarity MRR: " + str(file_path__similarity_mrr))
                            print("Text Similarity MRR: " + str(text_similarity_mrr))
                            print("Activeness MRR: " + str(activeness_mrr))
                            logging.info("File Path Accuracy        " + str(avg_file_path_top1_accuracy) + "          "
                                         + str(avg_file_path_top3_accuracy) + "         "
                                         + str(avg_file_path_top5_accuracy))
                            logging.info("Text Accuracy             " + str(avg_text_top1_accuracy) + "          " +
                                         str(avg_text_top3_accuracy) + "         " + str(avg_text_top5_accuracy))
                            logging.info("Activeness Accuracy       " + str(avg_act_top1_accuracy) + "          " +
                                         str(avg_act_top3_accuracy) + "         " + str(avg_act_top5_accuracy))
                            logging.info("File Path Similarity MRR: " + str(file_path__similarity_mrr))
                            logging.info("Text Similarity MRR: " + str(text_similarity_mrr))
                            logging.info("Activeness MRR: " + str(activeness_mrr))
                        flag = False

                        print("")
                        print("---------------------------------------------------------------------------")
                        print("alpha= " + str(i / 10) + " beta= " + str(j / 10) + " gamma= " + str(k / 10))
                        print("---------------------------------------------------------------------------")
                        print("                         Top1          Top3            Top5")
                        print("Combined Accuracy         " + str(avg_combined_top1_accuracy) + "          " +
                              str(avg_combined_top3_accuracy) + "         " + str(avg_combined_top5_accuracy))
                        print("Interec MRR: " + str(combined_mrr))
                        logging.info("---------------------------------------------------------------------------")
                        logging.info("                         Top1          Top3            Top5")
                        logging.info("Combined Accuracy         " + str(avg_combined_top1_accuracy) + "          " +
                                     str(avg_combined_top3_accuracy) + "         " + str(avg_combined_top5_accuracy))
                        logging.info("Combined MRR: " + str(combined_mrr))

                        row = {'alpha': str(i / 10),
                               'beta': str(j / 10),
                               'gamma': str(k / 10),
                               'Top1': str(avg_combined_top1_accuracy),
                               'Top3': str(avg_combined_top3_accuracy),
                               'Top5': str(avg_combined_top5_accuracy),
                               'MRR': str(combined_mrr)}
                        df = df.append(row, ignore_index=True)

        df.to_csv(str(interec_processor.database) + ".csv", index=False)
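The MRR values accumulated above follow the usual mean reciprocal rank definition: the reciprocal of the actual integrator's rank, averaged over all test PRs, with a rank of 0 (integrator not found) contributing nothing. A small worked sketch with made-up ranks:

# Illustrative ranks of the actual integrator for four hypothetical PRs;
# 0 means the integrator did not appear in the ranked list.
ranks = [1, 3, 0, 2]
mrr = sum(1.0 / r for r in ranks if r != 0) / len(ranks)
print(mrr)  # 0.4583333333333333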
Example #11
    def __calculate_scores(self, df, new_pr, date_window=120):
        # Calculate scores for each integrator
        for integrator in self.all_integrators:
            pr_integrator = Integrator(integrator[1])

            # Read all the PRs integrator reviewed before
            if date_window == 0:
                query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                         "integrator_login, files " \
                         "FROM pull_request " \
                         "WHERE merged_date < timestamp('%s') AND integrator_login = '******'" % \
                         (new_pr.created_date, pr_integrator.integrator_login)
                integrator_reviewed_prs = self.spark.sql(query1).collect()
            else:
                query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
                         "integrator_login, files " \
                         "FROM pull_request " \
                         "WHERE merged_date < timestamp('%s') " \
                         "AND merged_date > timestamp('%s') " \
                         "AND integrator_login = '******'" % \
                         (new_pr.created_date, new_pr.created_date - timedelta(days=date_window),
                          pr_integrator.integrator_login)
                integrator_reviewed_prs = self.spark.sql(query1).collect()

            for integrator_reviewed_pr in integrator_reviewed_prs:
                old_pr = PullRequest(integrator_reviewed_pr)
                old_pr_file_paths = old_pr.files

                # Calculate file path similarity
                for new_pr_file_path in new_pr.files:
                    for file_path in old_pr_file_paths:
                        number_of_file_combinations = len(
                            old_pr_file_paths) * len(new_pr.files)
                        max_file_path_length = max(
                            len(new_pr_file_path.split("/")),
                            len(file_path.split("/")))
                        divider = max_file_path_length * number_of_file_combinations

                        pr_integrator.longest_common_prefix_score += \
                            (self.file_path_similarity_calculator.longest_common_prefix_similarity(
                                new_pr_file_path, file_path) / divider)
                        pr_integrator.longest_common_suffix_score += \
                            (self.file_path_similarity_calculator.longest_common_suffix_similarity(
                                new_pr_file_path, file_path) / divider)
                        pr_integrator.longest_common_sub_string_score += \
                            (self.file_path_similarity_calculator.longest_common_sub_string_similarity(
                                new_pr_file_path, file_path) / divider)
                        pr_integrator.longest_common_sub_sequence_score += \
                            (self.file_path_similarity_calculator.longest_common_sub_sequence_similarity(
                                new_pr_file_path, file_path) / divider)

                # Calculate cosine similarity of title
                pr_integrator.pr_title_similarity \
                    += self.text_similarity_calculator.cos_similarity(new_pr.title, old_pr.title)

                # Calculate cosine similarity of description
                if new_pr.description != "" and old_pr.description != "":
                    pr_integrator.pr_description_similarity \
                        += self.text_similarity_calculator.cos_similarity(new_pr.description, old_pr.description)

                # Calculate activeness of the integrator
                pr_integrator.activeness += self.activeness_calculator.calculate_integrator_activeness(
                    new_pr, old_pr)

            row = {
                'new_pr_id': new_pr.pr_id,
                'new_pr_number': new_pr.pull_number,
                'integrator': pr_integrator.integrator_login,
                'lcp': pr_integrator.longest_common_prefix_score,
                'lcs': pr_integrator.longest_common_suffix_score,
                'lc_substr': pr_integrator.longest_common_sub_string_score,
                'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
                'cos_title': pr_integrator.pr_title_similarity,
                'cos_description': pr_integrator.pr_description_similarity,
                'activeness': pr_integrator.activeness,
                'file_similarity': pr_integrator.longest_common_prefix_score +
                                   pr_integrator.longest_common_suffix_score +
                                   pr_integrator.longest_common_sub_string_score +
                                   pr_integrator.longest_common_sub_sequence_score,
                'text_similarity': pr_integrator.pr_title_similarity +
                                   pr_integrator.pr_description_similarity
            }
            df = df.append(row, ignore_index=True)
        return df
Example #12
def calculate_scores(database, new_pr):
    pr_integrator = Integrator(new_pr.integrator_login)

    # Connection to MySQL database
    connection = pymysql.connect(host='localhost',
                                 port=3306,
                                 user='******',
                                 passwd='',
                                 db=database)

    try:
        with connection.cursor() as cursor:
            # Read records
            query2 = "SELECT * FROM pull_request WHERE merged_date <%s AND integrator_login =%s"
            inputs = (new_pr.created_date.strftime('%Y-%m-%d %H:%M:%S'),
                      new_pr.integrator_login)
            cursor.execute(query2, inputs)
            integrator_reviewed_prs = cursor.fetchall()
    finally:
        connection.close()

    for integrator_reviewed_pr in integrator_reviewed_prs:

        old_pr = PullRequest(integrator_reviewed_pr)
        old_pr_file_paths = old_pr.files

        # calculate file path similarity
        for new_pr_file_path in new_pr.files:
            for file_path in old_pr_file_paths:
                max_file_path_length = max(len(new_pr_file_path.split("/")),
                                           len(file_path.split("/")))
                pr_integrator.longest_common_prefix_score += \
                    (longest_common_prefix(new_pr_file_path, file_path) / max_file_path_length)
                pr_integrator.longest_common_suffix_score += \
                    (longest_common_suffix(new_pr_file_path, file_path) / max_file_path_length)
                pr_integrator.longest_common_sub_string_score += \
                    (longest_common_sub_string(new_pr_file_path, file_path) / max_file_path_length)
                pr_integrator.longest_common_sub_sequence_score += \
                    (longest_common_sub_sequence(new_pr_file_path, file_path) / max_file_path_length)

        # calculate cosine similarity of title
        pr_integrator.pr_title_similarity += cos_similarity(
            new_pr.title, old_pr.title)

        # calculate cosine similarity of description
        if new_pr.description != "" and old_pr.description != "":
            pr_integrator.pr_description_similarity += cos_similarity(
                new_pr.description, old_pr.description)

        # calculate activeness of the integrator
        pr_integrator.activeness += calculate_integrator_activeness(
            new_pr, old_pr)

        # calculate number of first pulls merged, total number of prs and total commits
        if old_pr.first_pull == 1:
            pr_integrator.num_of_first_pulls += 1
        pr_integrator.num_of_prs += 1
        pr_integrator.total_commits += old_pr.num_of_commits

    # calculate first pull similarity and average commits
    if pr_integrator.num_of_prs == 0:
        first_pull_similarity = 0
        average_commits = 0
    else:
        first_pull_similarity = pr_integrator.num_of_first_pulls / pr_integrator.num_of_prs
        average_commits = pr_integrator.total_commits / pr_integrator.num_of_prs

    row = {
        'lcp': pr_integrator.longest_common_prefix_score,
        'lcs': pr_integrator.longest_common_suffix_score,
        'lc_substr': pr_integrator.longest_common_sub_string_score,
        'ls_subseq': pr_integrator.longest_common_sub_sequence_score,
        'cos_title': pr_integrator.pr_title_similarity,
        'cos_description': pr_integrator.pr_description_similarity,
        'activeness': pr_integrator.activeness,
        'first_pull': first_pull_similarity,
        'avg_commits': average_commits
    }
    return row
Example #13
# time_decaying_parameter
const_lambda = -1

tfidf_vectorizer = TfidfVectorizer(analyzer=text_process)


def cos_similarity(title1, title2):
    term_frequency = tfidf_vectorizer.fit_transform([title1, title2])
    return (term_frequency * term_frequency.T).A[0, 1]


df1 = pd.DataFrame()

for test_pr in test_prs:
    test_pr = PullRequest(test_pr)
    print(test_pr.pr_id)
    for integrator in integrators:
        pr_integrator = Integrator(integrator[0])

        # Connection to MySQL database
        connection = pymysql.connect(host='localhost',
                                     port=3306,
                                     user='******',
                                     passwd='',
                                     db='rails')

        try:
            with connection.cursor() as cursor:
                # Read records
                query2 = "SELECT * FROM pull_request WHERE merged_date <%s AND integrator_login =%s"
Example #14
def test_accuracy_for_all_prs(main_data_csv_file_name, offset, limit):
    logging.basicConfig(level=logging.INFO,
                        filename='app.log',
                        format='%(name)s - %(levelname)s - %(message)s')

    main_df = pd.read_csv(main_data_csv_file_name)
    # TODO: standardise data
    act_min = main_df['activeness'].min()
    act_max = main_df['activeness'].max()
    file_sim_min = main_df['file_similarity'].min()
    file_sim_max = main_df['file_similarity'].max()
    txt_sim_min = main_df['text_similarity'].min()
    txt_sim_max = main_df['text_similarity'].max()

    main_df['std_activeness'] = \
        main_df['activeness'].apply(standardize_score, args=(act_min, act_max))
    main_df['std_file_similarity'] = \
        main_df['file_similarity'].apply(standardize_score, args=(file_sim_min, file_sim_max))
    main_df['std_text_similarity'] = \
        main_df['text_similarity'].apply(standardize_score, args=(txt_sim_min, txt_sim_max))

    query1 = "SELECT pr_id, pull_number, requester_login, title, description, created_date, merged_date, " \
             "integrator_login, files " \
             "FROM pull_request " \
             "WHERE pr_id > '%s' and pr_id <= '%s' " \
             "ORDER BY pr_id " \
             "LIMIT %d" % (offset, offset + limit, limit)
    all_prs = spark.sql(query1)

    flag = True
    for i in range(1, 9):
        for j in range(1, 9):
            for k in range(1, 9):
                if i != 0 and j != 0 and k != 0 and i + j + k == 10:
                    total_prs = 0
                    cmb_accuracy_array = [0, 0, 0]
                    file_accuracy_array = [0, 0, 0]
                    txt_accuracy_array = [0, 0, 0]
                    act_accuracy_array = [0, 0, 0]

                    print("")
                    print(
                        "---------------------------------------------------------------------------"
                    )
                    print("alpha= " + str(i / 10) + " beta= " + str(j / 10) +
                          " gamma= " + str(k / 10))

                    for new_pr in all_prs.collect():
                        total_prs += 1
                        new_pr = PullRequest(new_pr)

                        scores_df = main_df.loc[main_df['new_pr_id'] ==
                                                new_pr.pr_id].copy()

                        # print(new_pr.pr_id)

                        ranked_data_frame = generate_ranked_list(
                            scores_df, i / 10, j / 10, k / 10)

                        combined_accuracy = test_combined_accuracy(
                            ranked_data_frame, new_pr, True, True, True)
                        file_path_accuracy = test_file_path_similarity_accuracy(
                            ranked_data_frame, new_pr, True, True, True)
                        text_accuracy = test_text_similarity_accuracy(
                            ranked_data_frame, new_pr, True, True, True)
                        activeness_accuracy = test_activeness_accuracy(
                            ranked_data_frame, new_pr, True, True, True)

                        if hasattr(combined_accuracy,
                                   'top1') and combined_accuracy.top1:
                            cmb_accuracy_array[0] += 1
                        if hasattr(combined_accuracy,
                                   'top3') and combined_accuracy.top3:
                            cmb_accuracy_array[1] += 1
                        if hasattr(combined_accuracy,
                                   'top5') and combined_accuracy.top5:
                            cmb_accuracy_array[2] += 1

                        if hasattr(file_path_accuracy,
                                   'top1') and file_path_accuracy.top1:
                            file_accuracy_array[0] += 1
                        if hasattr(file_path_accuracy,
                                   'top3') and file_path_accuracy.top3:
                            file_accuracy_array[1] += 1
                        if hasattr(file_path_accuracy,
                                   'top5') and file_path_accuracy.top5:
                            file_accuracy_array[2] += 1

                        if hasattr(text_accuracy,
                                   'top1') and text_accuracy.top1:
                            txt_accuracy_array[0] += 1
                        if hasattr(text_accuracy,
                                   'top3') and text_accuracy.top3:
                            txt_accuracy_array[1] += 1
                        if hasattr(text_accuracy,
                                   'top5') and text_accuracy.top5:
                            txt_accuracy_array[2] += 1

                        if hasattr(activeness_accuracy,
                                   'top1') and activeness_accuracy.top1:
                            act_accuracy_array[0] += 1
                        if hasattr(activeness_accuracy,
                                   'top3') and activeness_accuracy.top3:
                            act_accuracy_array[1] += 1
                        if hasattr(activeness_accuracy,
                                   'top5') and activeness_accuracy.top5:
                            act_accuracy_array[2] += 1

                    avg_combined_top1_accuracy = cmb_accuracy_array[0] / total_prs
                    avg_combined_top3_accuracy = cmb_accuracy_array[1] / total_prs
                    avg_combined_top5_accuracy = cmb_accuracy_array[2] / total_prs

                    avg_file_path_top1_accuracy = file_accuracy_array[0] / total_prs
                    avg_file_path_top3_accuracy = file_accuracy_array[1] / total_prs
                    avg_file_path_top5_accuracy = file_accuracy_array[2] / total_prs

                    avg_text_top1_accuracy = txt_accuracy_array[0] / total_prs
                    avg_text_top3_accuracy = txt_accuracy_array[1] / total_prs
                    avg_text_top5_accuracy = txt_accuracy_array[2] / total_prs

                    avg_act_top1_accuracy = act_accuracy_array[0] / total_prs
                    avg_act_top3_accuracy = act_accuracy_array[1] / total_prs
                    avg_act_top5_accuracy = act_accuracy_array[2] / total_prs

                    print(
                        "---------------------------------------------------------------------------"
                    )
                    print(
                        "                         Top1          Top3            Top5"
                    )
                    print("Combined Accuracy         " +
                          str(avg_combined_top1_accuracy) + "          " +
                          str(avg_combined_top3_accuracy) + "         " +
                          str(avg_combined_top5_accuracy))
                    if flag:
                        print("File Path Accuracy        " +
                              str(avg_file_path_top1_accuracy) + "          " +
                              str(avg_file_path_top3_accuracy) + "         " +
                              str(avg_file_path_top5_accuracy))
                        print("Text Accuracy             " +
                              str(avg_text_top1_accuracy) + "          " +
                              str(avg_text_top3_accuracy) + "         " +
                              str(avg_text_top5_accuracy))
                        print("Activeness Accuracy       " +
                              str(avg_act_top1_accuracy) + "          " +
                              str(avg_act_top3_accuracy) + "         " +
                              str(avg_act_top5_accuracy))
                    flag = False
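standardize_score itself is not shown in these examples. Given that it is applied with the column minimum and maximum as extra arguments, a plausible min-max scaling sketch (an assumption, not the project's confirmed implementation) is:

def standardize_score(score, min_score, max_score):
    # Assumed min-max scaling into [0, 1], guarding against a constant column.
    if max_score == min_score:
        return 0
    return (score - min_score) / (max_score - min_score)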