Example #1
    def test_build_cached_report_table(self):
        """Check whether data items are correctly cached and updated/evicted."""
        data_id = "cache_test_data"
        project_name = "project"

        def create_empty_df():
            return pd.DataFrame(columns=["entry"])

        def create_cache_entry_data(entry: str):
            return pd.DataFrame(
                {"entry": entry},
                index=[0]), get_entry_id(entry), get_entry_timestamp(entry)

        def get_entry_id(entry: str) -> str:
            return self.TEST_DATA[entry][0]

        def get_entry_timestamp(entry: str) -> str:
            return str(self.TEST_DATA[entry][1])

        def is_newer_timestamp(ts1: str, ts2: str) -> bool:
            return int(ts1) > int(ts2)

        # initialize cache with a,b,c
        df = build_cached_report_table(data_id, project_name, ["a", "b", "c"],
                                       [], create_empty_df,
                                       create_cache_entry_data, get_entry_id,
                                       get_entry_timestamp, is_newer_timestamp)

        self.assertIn("a", df["entry"].values)
        self.assertIn("b", df["entry"].values)
        self.assertIn("c", df["entry"].values)

        # update c -> c2 and "update" b -> b
        df = build_cached_report_table(data_id, project_name, ["b", "c2"], [],
                                       create_empty_df,
                                       create_cache_entry_data, get_entry_id,
                                       get_entry_timestamp, is_newer_timestamp)

        self.assertIn("a", df["entry"].values)
        self.assertIn("b", df["entry"].values)
        self.assertNotIn("c", df["entry"].values)
        self.assertIn("c2", df["entry"].values)

        # delete a via a2
        df = build_cached_report_table(data_id, project_name, [], ["a"],
                                       create_empty_df,
                                       create_cache_entry_data, get_entry_id,
                                       get_entry_timestamp, is_newer_timestamp)

        self.assertNotIn("a2", df["entry"].values)
        self.assertIn("b", df["entry"].values)
        self.assertIn("c2", df["entry"].values)
Example #2
    def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                        case_study: tp.Optional[CaseStudy],
                        **kwargs: tp.Any) -> pd.DataFrame:
        def create_dataframe_layout() -> pd.DataFrame:
            df_layout = pd.DataFrame(columns=cls.COLUMNS)
            df_layout = df_layout.astype(cls.COLUMN_TYPES)
            return df_layout

        def create_data_frame_for_report(
                report_path: Path) -> tp.Tuple[pd.DataFrame, str, str]:
            report = load_commit_report(report_path)
            cf_head_interactions_raw = report.number_of_head_cf_interactions()
            df_head_interactions_raw = report.number_of_head_df_interactions()

            return pd.DataFrame(
                {
                    'revision':
                    report.head_commit.hash,
                    'time_id':
                    commit_map.short_time_id(report.head_commit),
                    'CFInteractions':
                    report.number_of_cf_interactions(),
                    'DFInteractions':
                    report.number_of_df_interactions(),
                    'HEAD CF Interactions':
                    cf_head_interactions_raw[0] + cf_head_interactions_raw[1],
                    'HEAD DF Interactions':
                    df_head_interactions_raw[0] + df_head_interactions_raw[1]
                },
                index=[0]), report.head_commit.hash, str(
                    report_path.stat().st_mtime_ns)

        report_files = get_processed_revisions_files(
            project_name, CommitReport,
            get_case_study_file_name_filter(case_study))

        failed_report_files = get_failed_revisions_files(
            project_name, CommitReport,
            get_case_study_file_name_filter(case_study))

        # cls.CACHE_ID is set by superclass
        # pylint: disable=E1101
        data_frame = build_cached_report_table(
            cls.CACHE_ID, project_name, report_files, failed_report_files,
            create_dataframe_layout, create_data_frame_for_report,
            lambda path: ReportFilename(path).commit_hash.hash,
            lambda path: str(path.stat().st_mtime_ns),
            lambda a, b: int(a) > int(b))

        return data_frame
Example #3
    def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                        case_study: tp.Optional[CaseStudy],
                        **kwargs: tp.Any) -> pd.DataFrame:
        def create_dataframe_layout() -> pd.DataFrame:
            df_layout = pd.DataFrame(columns=cls.COLUMNS)
            df_layout = df_layout.astype(cls.COLUMN_TYPES)
            return df_layout

        def create_data_frame_for_report(
                report_path: Path) -> tp.Tuple[pd.DataFrame, str, str]:
            report = load_blame_report(report_path)
            base_inter_c_repo_pair_mapping = \
                gen_base_to_inter_commit_repo_pair_mapping(report)

            def build_dataframe_row(base_hash: FullCommitHash,
                                    base_library: str,
                                    inter_hash: FullCommitHash,
                                    inter_library: str,
                                    amount: int) -> tp.Dict[str, tp.Any]:

                data_dict: tp.Dict[str, tp.Any] = {
                    'revision': report.head_commit.hash,
                    'time_id': commit_map.short_time_id(report.head_commit),
                    'base_hash': base_hash.hash,
                    'base_lib': base_library,
                    'inter_hash': inter_hash.hash,
                    'inter_lib': inter_library,
                    'amount': amount
                }
                return data_dict

            result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

            for base_pair in base_inter_c_repo_pair_mapping:
                inter_pair_amount_dict = base_inter_c_repo_pair_mapping[
                    base_pair]

                for inter_pair in inter_pair_amount_dict:
                    result_data_dicts.append(
                        build_dataframe_row(
                            base_hash=base_pair.commit.commit_hash,
                            base_library=base_pair.commit.repository_name,
                            inter_hash=inter_pair.commit.commit_hash,
                            inter_library=inter_pair.commit.repository_name,
                            amount=inter_pair_amount_dict[inter_pair]))

            return (pd.DataFrame(result_data_dicts),
                    report.head_commit.hash,
                    str(report_path.stat().st_mtime_ns))

        report_files = get_processed_revisions_files(
            project_name, BlameReport,
            get_case_study_file_name_filter(case_study))

        failed_report_files = get_failed_revisions_files(
            project_name, BlameReport,
            get_case_study_file_name_filter(case_study))

        # cls.CACHE_ID is set by superclass
        # pylint: disable=E1101
        data_frame = build_cached_report_table(
            cls.CACHE_ID, project_name, report_files, failed_report_files,
            create_dataframe_layout, create_data_frame_for_report,
            lambda path: ReportFilename(path).commit_hash.hash,
            lambda path: str(path.stat().st_mtime_ns),
            lambda a, b: int(a) > int(b))

        return data_frame
Example #4
    def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                        case_study: tp.Optional[CaseStudy],
                        **kwargs: tp.Any) -> pd.DataFrame:
        def create_dataframe_layout() -> pd.DataFrame:
            df_layout = pd.DataFrame(columns=cls.COLUMNS)
            df_layout = df_layout.astype(cls.COLUMN_TYPES)
            return df_layout

        def create_data_frame_for_report(
            report_paths: tp.Tuple[Path, Path]
        ) -> tp.Tuple[pd.DataFrame, str, str]:

            head_report = load_blame_report(report_paths[0])
            pred_report = load_blame_report(report_paths[1])

            diff_report = BlameReportDiff(head_report, pred_report)

            base_inter_c_repo_pair_mapping = \
                gen_base_to_inter_commit_repo_pair_mapping(diff_report)

            def build_dataframe_row(base_hash: FullCommitHash,
                                    base_library: str,
                                    inter_hash: FullCommitHash,
                                    inter_library: str,
                                    amount: int) -> tp.Dict[str, tp.Any]:

                data_dict: tp.Dict[str, tp.Any] = {
                    'revision': head_report.head_commit.hash,
                    'time_id':
                    commit_map.short_time_id(head_report.head_commit),
                    'base_hash': base_hash.hash,
                    'base_lib': base_library,
                    'inter_hash': inter_hash.hash,
                    'inter_lib': inter_library,
                    'amount': amount
                }
                return data_dict

            result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

            for base_pair in base_inter_c_repo_pair_mapping:
                inter_pair_amount_dict = base_inter_c_repo_pair_mapping[
                    base_pair]

                for inter_pair in inter_pair_amount_dict:
                    result_data_dicts.append(
                        build_dataframe_row(
                            base_hash=base_pair.commit.commit_hash,
                            base_library=base_pair.commit.repository_name,
                            inter_hash=inter_pair.commit.commit_hash,
                            inter_library=inter_pair.commit.repository_name,
                            amount=inter_pair_amount_dict[inter_pair]))

            return (pd.DataFrame(result_data_dicts),
                    id_from_paths(report_paths),
                    timestamp_from_paths(report_paths))

        report_pairs, failed_report_pairs = build_report_pairs_tuple(
            project_name, commit_map, case_study)

        # cls.CACHE_ID is set by superclass
        # pylint: disable=E1101
        data_frame = build_cached_report_table(
            cls.CACHE_ID, project_name, report_pairs, failed_report_pairs,
            create_dataframe_layout, create_data_frame_for_report,
            id_from_paths, timestamp_from_paths, compare_timestamps)

        return data_frame
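
The pair-based helpers id_from_paths, timestamp_from_paths, and compare_timestamps are used here but not shown. A minimal sketch consistent with the single-report lambdas from the earlier examples (the actual varats helpers may differ in detail; ReportFilename is the same class used above):

import typing as tp
from pathlib import Path


def id_from_paths(paths: tp.Tuple[Path, Path]) -> str:
    # Cache id built from the commit hashes of both reports.
    return (ReportFilename(paths[0]).commit_hash.hash + "_" +
            ReportFilename(paths[1]).commit_hash.hash)


def timestamp_from_paths(paths: tp.Tuple[Path, Path]) -> str:
    # Combined modification times of both report files.
    return (str(paths[0].stat().st_mtime_ns) + "_" +
            str(paths[1].stat().st_mtime_ns))


def compare_timestamps(ts1: str, ts2: str) -> bool:
    # A pair counts as newer if either of its files is newer.
    new_head, new_pred = (int(t) for t in ts1.split("_"))
    old_head, old_pred = (int(t) for t in ts2.split("_"))
    return new_head > old_head or new_pred > old_pred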
Example #5
    def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                        case_study: tp.Optional[CaseStudy],
                        **kwargs: tp.Any) -> pd.DataFrame:
        # pylint: disable=unused-argument

        def create_dataframe_layout() -> pd.DataFrame:
            df_layout = pd.DataFrame(columns=cls.COLUMNS)
            df_layout = df_layout.astype(cls.COLUMN_TYPES)
            return df_layout

        def create_data_frame_for_report(
                report_path: Path) -> tp.Tuple[pd.DataFrame, str, str]:

            report_file_name_match = re.search(
                BlameVerifierReportDatabase.report_file_name_pattern,
                str(report_path))

            if report_file_name_match:
                report_file_name = report_file_name_match.group()
            else:
                raise RuntimeWarning(
                    "report file name could not be read from report path")

            report: tp.Union[BlameVerifierReportOpt,
                             BlameVerifierReportNoOptTBAA]

            if BlameVerifierReportOpt.is_correct_report_type(report_file_name):
                report_opt = load_blame_verifier_report_opt(report_path)
                report = report_opt
                opt_level = OptLevel.OPT.value

            elif BlameVerifierReportNoOptTBAA.is_correct_report_type(
                    report_file_name):
                report_no_opt = load_blame_verifier_report_no_opt_tbaa(
                    report_path)

                report = report_no_opt
                opt_level = OptLevel.NO_OPT.value

            else:
                raise RuntimeWarning("unknown report type")

            number_of_total_annotations = report.get_total_annotations()
            number_of_successful_annotations = \
                report.get_successful_annotations()
            number_of_failed_annotations = report.get_failed_annotations()
            number_of_undetermined_annotations \
                = report.get_undetermined_annotations()

            return pd.DataFrame(
                {
                    'revision': report.head_commit.hash,
                    'time_id': commit_map.short_time_id(report.head_commit),
                    'opt_level': opt_level,
                    'total': number_of_total_annotations,
                    'successful': number_of_successful_annotations,
                    'failed': number_of_failed_annotations,
                    'undetermined': number_of_undetermined_annotations
                },
                index=[0]
                # Add prefix of report name to head_commit to differentiate
                # between reports with and without optimization
            ), report.head_commit.hash + report_path.name.split(
                "-", 1)[0], str(report_path.stat().st_mtime_ns)

        report_files_opt = get_processed_revisions_files(
            project_name, BlameVerifierReportOpt,
            get_case_study_file_name_filter(case_study))

        report_files_no_opt = get_processed_revisions_files(
            project_name, BlameVerifierReportNoOptTBAA,
            get_case_study_file_name_filter(case_study))

        report_files = report_files_opt + report_files_no_opt

        failed_report_files_opt = get_failed_revisions_files(
            project_name, BlameVerifierReportOpt,
            get_case_study_file_name_filter(case_study))

        failed_report_files_no_opt = get_failed_revisions_files(
            project_name, BlameVerifierReportNoOptTBAA,
            get_case_study_file_name_filter(case_study))

        failed_report_files = \
            failed_report_files_opt + failed_report_files_no_opt

        # cls.CACHE_ID is set by superclass
        # pylint: disable=E1101
        data_frame = build_cached_report_table(
            cls.CACHE_ID, project_name, report_files, failed_report_files,
            create_dataframe_layout, create_data_frame_for_report, lambda path:
            ReportFilename(path).commit_hash.hash + path.name.split("-", 1)[0],
            lambda path: str(path.stat().st_mtime_ns),
            lambda a, b: int(a) > int(b))
        return data_frame
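
Note how this example extends the cache id with the report-file prefix (path.name.split("-", 1)[0]) so that opt and no-opt reports for the same revision keep separate cache entries. Illustrated with a hypothetical file name (the real naming scheme is defined by varats' report classes):

from pathlib import Path

# Hypothetical report file name, for illustration only.
path = Path("BVRO-BlameVerifierReportOpt-grep-abc123_success.txt")
print(path.name.split("-", 1)[0])  # -> "BVRO"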
Example #6
    def _load_dataframe(
        cls, project_name: str, commit_map: CommitMap,
        case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
    ) -> pd.DataFrame:
        commit_lookup = create_commit_lookup_helper(project_name)

        def create_dataframe_layout() -> pd.DataFrame:
            df_layout = pd.DataFrame(columns=cls.COLUMNS)
            df_layout = df_layout.astype(cls.COLUMN_TYPES)
            return df_layout

        def create_data_frame_for_report(
            report_path: Path
        ) -> tp.Tuple[pd.DataFrame, str, str]:
            report = load_blame_report(report_path)

            categorised_degree_occurrences = generate_lib_dependent_degrees(
                report
            )

            def calc_total_amounts() -> int:
                total = 0

                for _, lib_dict in categorised_degree_occurrences.items():
                    for _, tuple_list in lib_dict.items():
                        for degree_amount_tuple in tuple_list:
                            total += degree_amount_tuple[1]
                return total

            total_amounts_of_all_libs = calc_total_amounts()

            list_of_author_degree_occurrences = generate_author_degree_tuples(
                report, commit_lookup
            )
            author_degrees, author_amounts = _split_tuple_values_in_lists_tuple(
                list_of_author_degree_occurrences
            )
            author_total = sum(author_amounts)

            list_of_max_time_deltas = generate_max_time_distribution_tuples(
                report, commit_lookup, MAX_TIME_BUCKET_SIZE
            )
            (max_time_buckets, max_time_amounts
            ) = _split_tuple_values_in_lists_tuple(list_of_max_time_deltas)
            total_max_time_amounts = sum(max_time_amounts)

            list_of_avg_time_deltas = generate_avg_time_distribution_tuples(
                report, commit_lookup, AVG_TIME_BUCKET_SIZE
            )
            (avg_time_buckets, avg_time_amounts
            ) = _split_tuple_values_in_lists_tuple(list_of_avg_time_deltas)
            total_avg_time_amounts = sum(avg_time_amounts)

            def build_dataframe_row(
                degree_type: DegreeType,
                degree: int,
                amount: int,
                total_amount: int,
                base_library: tp.Optional[str] = None,
                inter_library: tp.Optional[str] = None
            ) -> tp.Dict[str, tp.Any]:

                data_dict: tp.Dict[str, tp.Any] = {
                    'revision': report.head_commit.hash,
                    'time_id': commit_map.short_time_id(report.head_commit),
                    'degree_type': degree_type.value,
                    'base_lib': base_library,
                    'inter_lib': inter_library,
                    'degree': degree,
                    'amount': amount,
                    'fraction': np.divide(amount, total_amount)
                }
                return data_dict

            result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

            # Append interaction rows
            for base_lib_name, inter_lib_dict \
                    in categorised_degree_occurrences.items():

                for inter_lib_name, list_of_lib_degree_amount_tuples in \
                        inter_lib_dict.items():

                    (inter_degrees,
                     inter_amounts) = _split_tuple_values_in_lists_tuple(
                         list_of_lib_degree_amount_tuples
                     )

                    for i, _ in enumerate(inter_degrees):
                        degree = inter_degrees[i]
                        lib_amount = inter_amounts[i]

                        interaction_data_dict = build_dataframe_row(
                            degree_type=DegreeType.INTERACTION,
                            degree=degree,
                            amount=lib_amount,
                            total_amount=total_amounts_of_all_libs,
                            base_library=base_lib_name,
                            inter_library=inter_lib_name,
                        )
                        result_data_dicts.append(interaction_data_dict)

            def append_rows_of_degree_type(
                degree_type: DegreeType,
                degrees: tp.List[int],
                amounts: tp.List[int],
                sum_amounts: int,
            ) -> None:
                for k, _ in enumerate(degrees):
                    data_dict = build_dataframe_row(
                        degree_type=degree_type,
                        degree=degrees[k],
                        amount=amounts[k],
                        total_amount=sum_amounts
                    )
                    result_data_dicts.append(data_dict)

            # Append author rows
            append_rows_of_degree_type(
                degree_type=DegreeType.AUTHOR,
                degrees=author_degrees,
                amounts=author_amounts,
                sum_amounts=author_total
            )

            # Append max_time rows
            append_rows_of_degree_type(
                degree_type=DegreeType.MAX_TIME,
                degrees=max_time_buckets,
                amounts=max_time_amounts,
                sum_amounts=total_max_time_amounts
            )

            # Append avg_time rows
            append_rows_of_degree_type(
                degree_type=DegreeType.AVG_TIME,
                degrees=avg_time_buckets,
                amounts=avg_time_amounts,
                sum_amounts=total_avg_time_amounts
            )

            return (pd.DataFrame(result_data_dicts),
                    report.head_commit.hash,
                    str(report_path.stat().st_mtime_ns))

        report_files = get_processed_revisions_files(
            project_name, BlameReport,
            get_case_study_file_name_filter(case_study)
        )

        failed_report_files = get_failed_revisions_files(
            project_name, BlameReport,
            get_case_study_file_name_filter(case_study)
        )

        # cls.CACHE_ID is set by superclass
        # pylint: disable=E1101
        data_frame = build_cached_report_table(
            cls.CACHE_ID, project_name, report_files, failed_report_files,
            create_dataframe_layout, create_data_frame_for_report,
            lambda path: ReportFilename(path).commit_hash.hash,
            lambda path: str(path.stat().st_mtime_ns),
            lambda a, b: int(a) > int(b)
        )

        return data_frame
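
The helper _split_tuple_values_in_lists_tuple is used throughout this example but not shown; judging from its call sites it unzips a list of (value, amount) tuples into two parallel lists. A minimal sketch:

import typing as tp


def _split_tuple_values_in_lists_tuple(
    tuples: tp.List[tp.Tuple[int, int]]
) -> tp.Tuple[tp.List[int], tp.List[int]]:
    # [(d1, a1), (d2, a2), ...] -> ([d1, d2, ...], [a1, a2, ...])
    return [t[0] for t in tuples], [t[1] for t in tuples]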
Example #7
    def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                        case_study: tp.Optional[CaseStudy],
                        **kwargs: tp.Any) -> pd.DataFrame:
        repo = get_local_project_git(project_name)
        commit_lookup = create_commit_lookup_helper(project_name)

        def create_dataframe_layout() -> pd.DataFrame:
            df_layout = pd.DataFrame(columns=cls.COLUMNS)
            df_layout = df_layout.astype(cls.COLUMN_TYPES)
            return df_layout

        def create_data_frame_for_report(
            report_paths: tp.Tuple[Path, Path]
        ) -> tp.Tuple[pd.DataFrame, str, str]:
            # Look-up commit and infos about the HEAD commit of the report
            head_report = load_blame_report(report_paths[0])
            pred_report = load_blame_report(report_paths[1])
            commit = repo.get(head_report.head_commit.hash)
            commit_date = datetime.utcfromtimestamp(commit.commit_time)
            pred_commit = repo.get(pred_report.head_commit.hash)

            diff_between_head_pred = BlameReportDiff(head_report, pred_report)

            # Calculate the total churn between pred and base commit
            code_churn = calc_code_churn(
                Path(repo.path), FullCommitHash.from_pygit_commit(pred_commit),
                FullCommitHash.from_pygit_commit(commit),
                ChurnConfig.create_c_style_languages_config())
            total_churn = code_churn[1] + code_churn[2]

            def weighted_avg(tuples: tp.List[tp.Tuple[int, int]]) -> float:
                total_sum = 0
                degree_sum = 0
                for degree, amount in tuples:
                    degree_sum += degree
                    total_sum += (degree * amount)

                return total_sum / max(1, degree_sum)

            def combine_max(tuples: tp.List[tp.Tuple[int, int]]) -> float:
                if tuples:
                    return max([x for x, y in tuples])
                return 0

            return (pd.DataFrame(
                {
                    'revision':
                    head_report.head_commit.hash,
                    'time_id':
                    commit_map.short_time_id(head_report.head_commit),
                    'churn':
                    total_churn,
                    'num_interactions':
                    count_interactions(diff_between_head_pred),
                    'num_interacting_commits':
                    count_interacting_commits(diff_between_head_pred),
                    'num_interacting_authors':
                    count_interacting_authors(diff_between_head_pred,
                                              commit_lookup),
                    "ci_degree_mean":
                    weighted_avg(
                        generate_degree_tuples(diff_between_head_pred)),
                    "author_mean":
                    weighted_avg(
                        generate_author_degree_tuples(diff_between_head_pred,
                                                      commit_lookup)),
                    "avg_time_mean":
                    weighted_avg(
                        generate_avg_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1)),
                    "ci_degree_max":
                    combine_max(
                        generate_degree_tuples(diff_between_head_pred)),
                    "author_max":
                    combine_max(
                        generate_author_degree_tuples(diff_between_head_pred,
                                                      commit_lookup)),
                    "avg_time_max":
                    combine_max(
                        generate_max_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1)),
                    'year':
                    commit_date.year,
                },
                index=[0]), id_from_paths(report_paths),
                    timestamp_from_paths(report_paths))

        report_pairs, failed_report_pairs = build_report_pairs_tuple(
            project_name, commit_map, case_study)

        # cls.CACHE_ID is set by superclass
        # pylint: disable=E1101
        data_frame = build_cached_report_table(
            cls.CACHE_ID, project_name, report_pairs, failed_report_pairs,
            create_dataframe_layout, create_data_frame_for_report,
            id_from_paths, timestamp_from_paths, compare_timestamps)

        return data_frame
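
A quick worked example for the weighted_avg defined above (not project code): the degree/amount pairs (1, 10) and (2, 5) yield (1*10 + 2*5) / (1 + 2) = 20 / 3 ≈ 6.67. Note that, as written, the function normalizes by the sum of degrees, not by the sum of amounts.

pairs = [(1, 10), (2, 5)]
total_sum = sum(d * a for d, a in pairs)   # 20
degree_sum = sum(d for d, _ in pairs)      # 3
print(total_sum / max(1, degree_sum))      # 6.666...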
Example #8
def _load_dataframe_for_report(
    project_name: str, cache_id: str, columns: tp.List[str],
    commit_map: CommitMap, szz_report: SZZReport
) -> pd.DataFrame:
    commit_lookup = create_commit_lookup_helper(project_name)
    commit_map = get_commit_map(project_name)
    prj_src = get_primary_project_source(project_name)

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=columns)
        return df_layout

    def create_data_frame_for_report(
        report_paths: tp.Tuple[Path, Path]
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        # Look-up commit and infos about the HEAD commit of the report
        fix_report = load_blame_report(report_paths[0])
        intro_report = load_blame_report(report_paths[1])
        fix_commit = commit_lookup(
            CommitRepoPair(
                commit_map.convert_to_full_or_warn(fix_report.head_commit),
                prj_src.local
            )
        )
        intro_commit = commit_lookup(
            CommitRepoPair(
                commit_map.convert_to_full_or_warn(intro_report.head_commit),
                prj_src.local
            )
        )

        fix_in, fix_out = get_interacting_commits_for_commit(
            fix_report,
            CommitRepoPair(
                FullCommitHash.from_pygit_commit(fix_commit), prj_src.local
            )
        )
        intro_in, intro_out = get_interacting_commits_for_commit(
            intro_report,
            CommitRepoPair(
                FullCommitHash.from_pygit_commit(intro_commit), prj_src.local
            )
        )

        score = _calculate_szz_quality_score(
            fix_in, fix_out, intro_in, intro_out
        )

        return (
            pd.DataFrame({
                'revision': str(fix_report.head_commit),
                'time_id': commit_map.short_time_id(fix_report.head_commit),
                'introducer': str(intro_report.head_commit),
                'score': score
            },
                         index=[0]), id_from_paths(report_paths),
            timestamp_from_paths(report_paths)
        )

    report_map = _get_requested_report_paths(project_name, szz_report)
    available_revisions = report_map.keys()

    new_entries: tp.List[tp.Tuple[Path, Path]] = []
    remove_entries: tp.List[tp.Tuple[Path, Path]] = []
    bugs = szz_report.get_all_raw_bugs()
    for bug in bugs:
        fix = bug.fixing_commit.to_short_commit_hash()
        if fix in available_revisions:
            for introducer in bug.introducing_commits:
                intro = introducer.to_short_commit_hash()
                if intro in available_revisions:
                    new_entries.append((report_map[fix], report_map[intro]))

    data_frame = build_cached_report_table(
        cache_id, project_name, new_entries, remove_entries,
        create_dataframe_layout, create_data_frame_for_report, id_from_paths,
        timestamp_from_paths, compare_timestamps
    )

    return data_frame