def test_build_cached_report_table(self):
    """Check whether data items are correctly cached and updated/evicted."""
    data_id = "cache_test_data"
    project_name = "project"

    def create_empty_df():
        return pd.DataFrame(columns=["entry"])

    def create_cache_entry_data(entry: str):
        return pd.DataFrame({"entry": entry}, index=[0]), \
            get_entry_id(entry), get_entry_timestamp(entry)

    def get_entry_id(entry: str) -> str:
        return self.TEST_DATA[entry][0]

    def get_entry_timestamp(entry: str) -> str:
        return str(self.TEST_DATA[entry][1])

    def is_newer_timestamp(ts1: str, ts2: str) -> bool:
        return int(ts1) > int(ts2)

    # initialize cache with a, b, c
    df = build_cached_report_table(
        data_id, project_name, ["a", "b", "c"], [], create_empty_df,
        create_cache_entry_data, get_entry_id, get_entry_timestamp,
        is_newer_timestamp)

    self.assertIn("a", df["entry"].values)
    self.assertIn("b", df["entry"].values)
    self.assertIn("c", df["entry"].values)

    # update c -> c2 and "update" b -> b
    df = build_cached_report_table(
        data_id, project_name, ["b", "c2"], [], create_empty_df,
        create_cache_entry_data, get_entry_id, get_entry_timestamp,
        is_newer_timestamp)

    self.assertIn("a", df["entry"].values)
    self.assertIn("b", df["entry"].values)
    self.assertNotIn("c", df["entry"].values)
    self.assertIn("c2", df["entry"].values)

    # delete a via a2 (a2 shares a's entry id but carries a newer timestamp,
    # so listing it as failed evicts the cached row for a)
    df = build_cached_report_table(
        data_id, project_name, [], ["a2"], create_empty_df,
        create_cache_entry_data, get_entry_id, get_entry_timestamp,
        is_newer_timestamp)

    self.assertNotIn("a", df["entry"].values)
    self.assertNotIn("a2", df["entry"].values)
    self.assertIn("b", df["entry"].values)
    self.assertIn("c2", df["entry"].values)
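# A sketch of the contract build_cached_report_table is expected to fulfil,
# inferred from its call sites in this file (parameter names below are
# assumptions, not the verbatim implementation): new entries are parsed via
# create_cache_entry_data and merged into the cached table; a cached row is
# replaced when a new entry with the same id carries a newer timestamp, and
# evicted when a matching, newer entry appears in the failed list.
#
#   def build_cached_report_table(
#       data_id: str,
#       project_name: str,
#       new_entries: tp.List[tp.Any],
#       failed_entries: tp.List[tp.Any],
#       create_empty_df: tp.Callable[[], pd.DataFrame],
#       create_cache_entry_data: tp.Callable[
#           [tp.Any], tp.Tuple[pd.DataFrame, str, str]],
#       get_entry_id: tp.Callable[[tp.Any], str],
#       get_entry_timestamp: tp.Callable[[tp.Any], str],
#       is_newer_timestamp: tp.Callable[[str, str], bool]
#   ) -> pd.DataFrame: ...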
def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                    case_study: tp.Optional[CaseStudy],
                    **kwargs: tp.Any) -> pd.DataFrame:

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
            report_path: Path) -> tp.Tuple[pd.DataFrame, str, str]:
        report = load_commit_report(report_path)
        cf_head_interactions_raw = report.number_of_head_cf_interactions()
        df_head_interactions_raw = report.number_of_head_df_interactions()
        return pd.DataFrame(
            {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'CFInteractions': report.number_of_cf_interactions(),
                'DFInteractions': report.number_of_df_interactions(),
                'HEAD CF Interactions':
                    cf_head_interactions_raw[0] + cf_head_interactions_raw[1],
                'HEAD DF Interactions':
                    df_head_interactions_raw[0] + df_head_interactions_raw[1]
            },
            index=[0]
        ), report.head_commit.hash, str(report_path.stat().st_mtime_ns)

    report_files = get_processed_revisions_files(
        project_name, CommitReport,
        get_case_study_file_name_filter(case_study))

    failed_report_files = get_failed_revisions_files(
        project_name, CommitReport,
        get_case_study_file_name_filter(case_study))

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash,
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b))
    return data_frame
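# Hypothetical usage sketch: _load_dataframe is not meant to be called
# directly; a concrete Database subclass (class and method names below are
# assumptions, not verbatim API) is queried through a shared superclass entry
# point that forwards to _load_dataframe and selects the requested columns:
#
#   df = CommitInteractionDatabase.get_data_for_project(
#       "example_project",               # hypothetical project name
#       ["revision", "CFInteractions"],  # subset of cls.COLUMNS
#       commit_map,
#       case_study,
#   )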
def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                    case_study: tp.Optional[CaseStudy],
                    **kwargs: tp.Any) -> pd.DataFrame:

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
            report_path: Path) -> tp.Tuple[pd.DataFrame, str, str]:
        report = load_blame_report(report_path)
        base_inter_c_repo_pair_mapping = \
            gen_base_to_inter_commit_repo_pair_mapping(report)

        def build_dataframe_row(base_hash: FullCommitHash,
                                base_library: str,
                                inter_hash: FullCommitHash,
                                inter_library: str,
                                amount: int) -> tp.Dict[str, tp.Any]:
            data_dict: tp.Dict[str, tp.Any] = {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'base_hash': base_hash.hash,
                'base_lib': base_library,
                'inter_hash': inter_hash.hash,
                'inter_lib': inter_library,
                'amount': amount
            }
            return data_dict

        result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

        for base_pair in base_inter_c_repo_pair_mapping:
            inter_pair_amount_dict = base_inter_c_repo_pair_mapping[base_pair]

            for inter_pair in inter_pair_amount_dict:
                result_data_dicts.append(
                    build_dataframe_row(
                        base_hash=base_pair.commit.commit_hash,
                        base_library=base_pair.commit.repository_name,
                        inter_hash=inter_pair.commit.commit_hash,
                        inter_library=inter_pair.commit.repository_name,
                        amount=inter_pair_amount_dict[inter_pair]))

        return pd.DataFrame(result_data_dicts), report.head_commit.hash, str(
            report_path.stat().st_mtime_ns)

    report_files = get_processed_revisions_files(
        project_name, BlameReport,
        get_case_study_file_name_filter(case_study))

    failed_report_files = get_failed_revisions_files(
        project_name, BlameReport,
        get_case_study_file_name_filter(case_study))

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash,
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b))
    return data_frame
def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                    case_study: tp.Optional[CaseStudy],
                    **kwargs: tp.Any) -> pd.DataFrame:

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
        report_paths: tp.Tuple[Path, Path]
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        head_report = load_blame_report(report_paths[0])
        pred_report = load_blame_report(report_paths[1])
        diff_report = BlameReportDiff(head_report, pred_report)
        base_inter_c_repo_pair_mapping = \
            gen_base_to_inter_commit_repo_pair_mapping(diff_report)

        def build_dataframe_row(base_hash: FullCommitHash,
                                base_library: str,
                                inter_hash: FullCommitHash,
                                inter_library: str,
                                amount: int) -> tp.Dict[str, tp.Any]:
            data_dict: tp.Dict[str, tp.Any] = {
                'revision': head_report.head_commit.hash,
                'time_id': commit_map.short_time_id(head_report.head_commit),
                'base_hash': base_hash.hash,
                'base_lib': base_library,
                'inter_hash': inter_hash.hash,
                'inter_lib': inter_library,
                'amount': amount
            }
            return data_dict

        result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

        for base_pair in base_inter_c_repo_pair_mapping:
            inter_pair_amount_dict = base_inter_c_repo_pair_mapping[base_pair]

            for inter_pair in inter_pair_amount_dict:
                result_data_dicts.append(
                    build_dataframe_row(
                        base_hash=base_pair.commit.commit_hash,
                        base_library=base_pair.commit.repository_name,
                        inter_hash=inter_pair.commit.commit_hash,
                        inter_library=inter_pair.commit.repository_name,
                        amount=inter_pair_amount_dict[inter_pair]))

        return (pd.DataFrame(result_data_dicts), id_from_paths(report_paths),
                timestamp_from_paths(report_paths))

    report_pairs, failed_report_pairs = build_report_pairs_tuple(
        project_name, commit_map, case_study)

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_pairs, failed_report_pairs,
        create_dataframe_layout, create_data_frame_for_report, id_from_paths,
        timestamp_from_paths, compare_timestamps)
    return data_frame
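# The pair-based cache helpers used above (id_from_paths, timestamp_from_paths,
# compare_timestamps) are defined elsewhere in the code base. A minimal sketch
# of plausible implementations, mirroring the single-path lambdas the other
# loaders in this file pass to build_cached_report_table (an assumption, not
# the verbatim code; path.stem stands in for the project's ReportFilename
# commit-hash parser):
import typing as tp
from pathlib import Path


def id_from_paths_sketch(paths: tp.Tuple[Path, Path]) -> str:
    # Combine the ids of the head and predecessor report into one cache id,
    # e.g. ReportFilename(paths[0]).commit_hash.hash + "_" + ...
    return f"{paths[0].stem}_{paths[1].stem}"


def timestamp_from_paths_sketch(paths: tp.Tuple[Path, Path]) -> str:
    # Join both files' mtimes so a change to either report invalidates the row.
    return f"{paths[0].stat().st_mtime_ns}_{paths[1].stat().st_mtime_ns}"


def compare_timestamps_sketch(ts1: str, ts2: str) -> bool:
    # The pair is considered newer if either component timestamp is newer.
    ts1_head, ts1_pred = ts1.split("_")
    ts2_head, ts2_pred = ts2.split("_")
    return int(ts1_head) > int(ts2_head) or int(ts1_pred) > int(ts2_pred)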
def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                    case_study: tp.Optional[CaseStudy],
                    **kwargs: tp.Any) -> pd.DataFrame:
    # pylint: disable=unused-argument

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
            report_path: Path) -> tp.Tuple[pd.DataFrame, str, str]:
        report_file_name_match = re.search(
            BlameVerifierReportDatabase.report_file_name_pattern,
            str(report_path))

        if report_file_name_match:
            report_file_name = report_file_name_match.group()
        else:
            raise RuntimeWarning(
                "report file name could not be read from report path")

        report: tp.Union[BlameVerifierReportOpt,
                         BlameVerifierReportNoOptTBAA]

        if BlameVerifierReportOpt.is_correct_report_type(report_file_name):
            report_opt = load_blame_verifier_report_opt(report_path)
            report = report_opt
            opt_level = OptLevel.OPT.value
        elif BlameVerifierReportNoOptTBAA.is_correct_report_type(
                report_file_name):
            report_no_opt = load_blame_verifier_report_no_opt_tbaa(
                report_path)
            report = report_no_opt
            opt_level = OptLevel.NO_OPT.value
        else:
            raise RuntimeWarning("unknown report type")

        number_of_total_annotations = report.get_total_annotations()
        number_of_successful_annotations = \
            report.get_successful_annotations()
        number_of_failed_annotations = report.get_failed_annotations()
        number_of_undetermined_annotations = \
            report.get_undetermined_annotations()

        return pd.DataFrame(
            {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'opt_level': opt_level,
                'total': number_of_total_annotations,
                'successful': number_of_successful_annotations,
                'failed': number_of_failed_annotations,
                'undetermined': number_of_undetermined_annotations
            },
            index=[0]
            # Add prefix of report name to head_commit to differentiate
            # between reports with and without optimization
        ), report.head_commit.hash + report_path.name.split("-", 1)[0], str(
            report_path.stat().st_mtime_ns)

    report_files_opt = get_processed_revisions_files(
        project_name, BlameVerifierReportOpt,
        get_case_study_file_name_filter(case_study))

    report_files_no_opt = get_processed_revisions_files(
        project_name, BlameVerifierReportNoOptTBAA,
        get_case_study_file_name_filter(case_study))

    report_files = report_files_opt + report_files_no_opt

    failed_report_files_opt = get_failed_revisions_files(
        project_name, BlameVerifierReportOpt,
        get_case_study_file_name_filter(case_study))

    failed_report_files_no_opt = get_failed_revisions_files(
        project_name, BlameVerifierReportNoOptTBAA,
        get_case_study_file_name_filter(case_study))

    failed_report_files = \
        failed_report_files_opt + failed_report_files_no_opt

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash +
        path.name.split("-", 1)[0],
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b))
    return data_frame
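# Why the entry id above appends the report-name prefix: Opt and NoOpt
# verifier reports for the same revision share a commit hash, so the hash
# alone would collide in the cache. Illustrative example (file names are
# hypothetical):
#
#   "BVR_Opt-project-abc123.yaml".split("-", 1)[0]    # -> "BVR_Opt"
#   "BVR_NoOpt-project-abc123.yaml".split("-", 1)[0]  # -> "BVR_NoOpt"
#
# so "abc123" + prefix yields two distinct cache rows per revision.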
def _load_dataframe(
    cls, project_name: str, commit_map: CommitMap,
    case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
) -> pd.DataFrame:
    commit_lookup = create_commit_lookup_helper(project_name)

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
        report_path: Path
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        report = load_blame_report(report_path)

        categorised_degree_occurrences = generate_lib_dependent_degrees(
            report
        )

        def calc_total_amounts() -> int:
            total = 0

            for _, lib_dict in categorised_degree_occurrences.items():
                for _, tuple_list in lib_dict.items():
                    for degree_amount_tuple in tuple_list:
                        total += degree_amount_tuple[1]

            return total

        total_amounts_of_all_libs = calc_total_amounts()

        list_of_author_degree_occurrences = generate_author_degree_tuples(
            report, commit_lookup
        )
        author_degrees, author_amounts = _split_tuple_values_in_lists_tuple(
            list_of_author_degree_occurrences
        )
        author_total = sum(author_amounts)

        list_of_max_time_deltas = generate_max_time_distribution_tuples(
            report, commit_lookup, MAX_TIME_BUCKET_SIZE
        )
        (max_time_buckets,
         max_time_amounts) = _split_tuple_values_in_lists_tuple(
             list_of_max_time_deltas
         )
        total_max_time_amounts = sum(max_time_amounts)

        list_of_avg_time_deltas = generate_avg_time_distribution_tuples(
            report, commit_lookup, AVG_TIME_BUCKET_SIZE
        )
        (avg_time_buckets,
         avg_time_amounts) = _split_tuple_values_in_lists_tuple(
             list_of_avg_time_deltas
         )
        total_avg_time_amounts = sum(avg_time_amounts)

        def build_dataframe_row(
            degree_type: DegreeType, degree: int, amount: int,
            total_amount: int, base_library: tp.Optional[str] = None,
            inter_library: tp.Optional[str] = None
        ) -> tp.Dict[str, tp.Any]:
            data_dict: tp.Dict[str, tp.Any] = {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'degree_type': degree_type.value,
                'base_lib': base_library,
                'inter_lib': inter_library,
                'degree': degree,
                'amount': amount,
                'fraction': np.divide(amount, total_amount)
            }
            return data_dict

        result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

        # Append interaction rows
        for base_lib_name, inter_lib_dict \
                in categorised_degree_occurrences.items():

            for inter_lib_name, list_of_lib_degree_amount_tuples in \
                    inter_lib_dict.items():

                (inter_degrees,
                 inter_amounts) = _split_tuple_values_in_lists_tuple(
                     list_of_lib_degree_amount_tuples
                 )

                for i, _ in enumerate(inter_degrees):
                    degree = inter_degrees[i]
                    lib_amount = inter_amounts[i]

                    interaction_data_dict = build_dataframe_row(
                        degree_type=DegreeType.INTERACTION,
                        degree=degree,
                        amount=lib_amount,
                        total_amount=total_amounts_of_all_libs,
                        base_library=base_lib_name,
                        inter_library=inter_lib_name,
                    )
                    result_data_dicts.append(interaction_data_dict)

        def append_rows_of_degree_type(
            degree_type: DegreeType,
            degrees: tp.List[int],
            amounts: tp.List[int],
            sum_amounts: int,
        ) -> None:
            for k, _ in enumerate(degrees):
                data_dict = build_dataframe_row(
                    degree_type=degree_type,
                    degree=degrees[k],
                    amount=amounts[k],
                    total_amount=sum_amounts
                )
                result_data_dicts.append(data_dict)

        # Append author rows
        append_rows_of_degree_type(
            degree_type=DegreeType.AUTHOR,
            degrees=author_degrees,
            amounts=author_amounts,
            sum_amounts=author_total
        )

        # Append max_time rows
        append_rows_of_degree_type(
            degree_type=DegreeType.MAX_TIME,
            degrees=max_time_buckets,
            amounts=max_time_amounts,
            sum_amounts=total_max_time_amounts
        )

        # Append avg_time rows
        append_rows_of_degree_type(
            degree_type=DegreeType.AVG_TIME,
            degrees=avg_time_buckets,
            amounts=avg_time_amounts,
            sum_amounts=total_avg_time_amounts
        )

        return pd.DataFrame(result_data_dicts), report.head_commit.hash, str(
            report_path.stat().st_mtime_ns
        )

    report_files = get_processed_revisions_files(
        project_name, BlameReport, get_case_study_file_name_filter(case_study)
    )

    failed_report_files = get_failed_revisions_files(
        project_name, BlameReport, get_case_study_file_name_filter(case_study)
    )

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash,
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b)
    )
    return data_frame
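# _split_tuple_values_in_lists_tuple is used throughout the loader above but
# defined elsewhere; judging by its call sites it unzips a list of
# (degree, amount) tuples into two parallel lists. A minimal sketch under that
# assumption (not the project's verbatim helper):
import typing as tp


def _split_tuple_values_in_lists_tuple_sketch(
    tuples: tp.List[tp.Tuple[int, int]]
) -> tp.Tuple[tp.List[int], tp.List[int]]:
    # [(1, 5), (2, 3)] -> ([1, 2], [5, 3])
    return [t[0] for t in tuples], [t[1] for t in tuples]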
def _load_dataframe(cls, project_name: str, commit_map: CommitMap,
                    case_study: tp.Optional[CaseStudy],
                    **kwargs: tp.Any) -> pd.DataFrame:
    repo = get_local_project_git(project_name)
    commit_lookup = create_commit_lookup_helper(project_name)

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
        report_paths: tp.Tuple[Path, Path]
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        # Look up the commit and info about the HEAD commit of the report
        head_report = load_blame_report(report_paths[0])
        pred_report = load_blame_report(report_paths[1])
        commit = repo.get(head_report.head_commit.hash)
        commit_date = datetime.utcfromtimestamp(commit.commit_time)
        pred_commit = repo.get(pred_report.head_commit.hash)

        diff_between_head_pred = BlameReportDiff(head_report, pred_report)

        # Calculate the total churn between pred and base commit
        code_churn = calc_code_churn(
            Path(repo.path), FullCommitHash.from_pygit_commit(pred_commit),
            FullCommitHash.from_pygit_commit(commit),
            ChurnConfig.create_c_style_languages_config())
        total_churn = code_churn[1] + code_churn[2]

        def weighted_avg(tuples: tp.List[tp.Tuple[int, int]]) -> float:
            total_sum = 0
            degree_sum = 0
            for degree, amount in tuples:
                degree_sum += degree
                total_sum += (degree * amount)

            return total_sum / max(1, degree_sum)

        def combine_max(tuples: tp.List[tp.Tuple[int, int]]) -> float:
            if tuples:
                return max([x for x, y in tuples])
            return 0

        return (pd.DataFrame(
            {
                'revision':
                    head_report.head_commit.hash,
                'time_id':
                    commit_map.short_time_id(head_report.head_commit),
                'churn':
                    total_churn,
                'num_interactions':
                    count_interactions(diff_between_head_pred),
                'num_interacting_commits':
                    count_interacting_commits(diff_between_head_pred),
                'num_interacting_authors':
                    count_interacting_authors(diff_between_head_pred,
                                              commit_lookup),
                "ci_degree_mean":
                    weighted_avg(
                        generate_degree_tuples(diff_between_head_pred)),
                "author_mean":
                    weighted_avg(
                        generate_author_degree_tuples(
                            diff_between_head_pred, commit_lookup)),
                "avg_time_mean":
                    weighted_avg(
                        generate_avg_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1)),
                "ci_degree_max":
                    combine_max(
                        generate_degree_tuples(diff_between_head_pred)),
                "author_max":
                    combine_max(
                        generate_author_degree_tuples(
                            diff_between_head_pred, commit_lookup)),
                "avg_time_max":
                    combine_max(
                        generate_max_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1)),
                'year':
                    commit_date.year,
            },
            index=[0]), id_from_paths(report_paths),
                timestamp_from_paths(report_paths))

    report_pairs, failed_report_pairs = build_report_pairs_tuple(
        project_name, commit_map, case_study)

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_pairs, failed_report_pairs,
        create_dataframe_layout, create_data_frame_for_report, id_from_paths,
        timestamp_from_paths, compare_timestamps)
    return data_frame
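# Semantics of the two helpers above, by example: weighted_avg weights each
# amount by its degree and normalizes by the degree sum (not the amount sum),
# and combine_max picks the largest degree while ignoring amounts:
#
#   weighted_avg([(1, 4), (2, 3)])  # -> (1*4 + 2*3) / (1 + 2) = 10/3
#   combine_max([(1, 4), (2, 3)])   # -> 2
#   combine_max([])                 # -> 0 (empty input is guarded)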
def _load_dataframe_for_report(
    project_name: str, cache_id: str, columns: tp.List[str],
    commit_map: CommitMap, szz_report: SZZReport
) -> pd.DataFrame:
    commit_lookup = create_commit_lookup_helper(project_name)
    commit_map = get_commit_map(project_name)
    prj_src = get_primary_project_source(project_name)

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=columns)
        return df_layout

    def create_data_frame_for_report(
        report_paths: tp.Tuple[Path, Path]
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        # Look up the fixing and introducing commits for the report pair
        fix_report = load_blame_report(report_paths[0])
        intro_report = load_blame_report(report_paths[1])
        fix_commit = commit_lookup(
            CommitRepoPair(
                commit_map.convert_to_full_or_warn(fix_report.head_commit),
                prj_src.local
            )
        )
        intro_commit = commit_lookup(
            CommitRepoPair(
                commit_map.convert_to_full_or_warn(intro_report.head_commit),
                prj_src.local
            )
        )

        fix_in, fix_out = get_interacting_commits_for_commit(
            fix_report,
            CommitRepoPair(
                FullCommitHash.from_pygit_commit(fix_commit), prj_src.local
            )
        )
        intro_in, intro_out = get_interacting_commits_for_commit(
            intro_report,
            CommitRepoPair(
                FullCommitHash.from_pygit_commit(intro_commit), prj_src.local
            )
        )
        score = _calculate_szz_quality_score(
            fix_in, fix_out, intro_in, intro_out
        )

        return (
            pd.DataFrame({
                'revision': str(fix_report.head_commit),
                'time_id': commit_map.short_time_id(fix_report.head_commit),
                'introducer': str(intro_report.head_commit),
                'score': score
            },
                         index=[0]), id_from_paths(report_paths),
            timestamp_from_paths(report_paths)
        )

    report_map = _get_requested_report_paths(project_name, szz_report)
    available_revisions = report_map.keys()

    # Build (fixing report, introducing report) pairs for every bug whose
    # fixing and introducing revisions both have a report available; no
    # entries are ever scheduled for removal here.
    new_entries: tp.List[tp.Tuple[Path, Path]] = []
    remove_entries: tp.List[tp.Tuple[Path, Path]] = []
    bugs = szz_report.get_all_raw_bugs()
    for bug in bugs:
        fix = bug.fixing_commit.to_short_commit_hash()
        if fix in available_revisions:
            for introducer in bug.introducing_commits:
                intro = introducer.to_short_commit_hash()
                if intro in available_revisions:
                    new_entries.append((report_map[fix], report_map[intro]))

    data_frame = build_cached_report_table(
        cache_id, project_name, new_entries, remove_entries,
        create_dataframe_layout, create_data_frame_for_report, id_from_paths,
        timestamp_from_paths, compare_timestamps
    )

    return data_frame