def test_extensions_repr_gen(self):
    """Check that extension representations are generated correctly."""
    c_config = ChurnConfig.create_c_language_config()
    self.assertEqual(c_config.get_extensions_repr(), ["c", "h"])
    self.assertEqual(
        c_config.get_extensions_repr(prefix="*."), ["*.c", "*.h"]
    )
    self.assertEqual(c_config.get_extensions_repr(suffix="|"), ["c|", "h|"])

    c_style_config = ChurnConfig.create_c_style_languages_config()
    self.assertEqual(
        c_style_config.get_extensions_repr(),
        ["c", "cpp", "cxx", "h", "hpp", "hxx"]
    )
    self.assertEqual(
        c_style_config.get_extensions_repr(prefix="*."),
        ["*.c", "*.cpp", "*.cxx", "*.h", "*.hpp", "*.hxx"]
    )
    self.assertEqual(
        c_style_config.get_extensions_repr(suffix="|"),
        ["c|", "cpp|", "cxx|", "h|", "hpp|", "hxx|"]
    )
def test_enable_language(self):
    """Check that languages can be enabled one at a time."""
    init_config = ChurnConfig.create_default_config()
    self.assertFalse(init_config.is_enabled('c'))
    init_config.enable_language(ChurnConfig.Language.CPP)
    self.assertFalse(init_config.is_enabled('c'))
    init_config.enable_language(ChurnConfig.Language.C)
    self.assertTrue(init_config.is_enabled('c'))
def _collect_cig_node_data(
    project_name: str, revision: FullCommitHash
) -> tp.List[tp.Dict[str, tp.Any]]:
    churn_config = ChurnConfig.create_c_style_languages_config()
    cig = create_blame_interaction_graph(
        project_name, revision
    ).commit_interaction_graph()
    commit_lookup = create_commit_lookup_helper(project_name)
    repo_lookup = get_local_project_gits(project_name)

    def filter_nodes(node: CommitRepoPair) -> bool:
        if node.commit_hash == UNCOMMITTED_COMMIT_HASH:
            return False
        return bool(commit_lookup(node))

    nodes: tp.List[tp.Dict[str, tp.Any]] = []
    for node in cig.nodes:
        node_attrs = tp.cast(CIGNodeAttrs, cig.nodes[node])
        commit = node_attrs["commit"]
        if not filter_nodes(commit):
            continue
        _, insertions, _ = calc_commit_code_churn(
            Path(repo_lookup[commit.repository_name].path),
            commit.commit_hash, churn_config
        )
        if insertions == 0:
            LOG.warning(f"Churn for commit {commit} is 0.")
            insertions = 1
        nodes.append({
            "commit_hash": commit.commit_hash.hash,
            "degree": cig.degree(node),
            "insertions": insertions,
        })
    return nodes
def test_c_style_config(self):
    """Check that the c-style config enables all C/C++ file extensions."""
    c_style_config = ChurnConfig.create_c_style_languages_config()
    self.assertTrue(c_style_config.is_enabled('h'))
    self.assertTrue(c_style_config.is_enabled('c'))
    self.assertTrue(c_style_config.is_enabled('hpp'))
    self.assertTrue(c_style_config.is_enabled('cpp'))
    self.assertTrue(c_style_config.is_enabled('hxx'))
    self.assertTrue(c_style_config.is_enabled('cxx'))
def create_graph() -> nx.DiGraph:
    repos = get_local_project_gits(self.project_name)
    interaction_graph = nx.DiGraph()
    churn_config = ChurnConfig.create_c_style_languages_config()
    # Match c/c++ source files by extension, e.g., "\.c$|\.cpp$|...".
    file_pattern = re.compile(
        r"|".join(churn_config.get_extensions_repr(prefix=r"\.", suffix=r"$"))
    )
    # Parse "git blame -w -s -l --root" output: commit hash, optional
    # file name, line number, and the blamed line's content.
    blame_regex = re.compile(r"^([0-9a-f]+)\s+(?:.+\s+)?[\d]+\) ?(.*)$")

    for repo_name in repos:
        repo_path = get_local_project_git_path(self.project_name, repo_name)
        project_git = git["-C", str(repo_path)]
        head_commit = get_submodule_head(
            self.project_name, repo_name, self.__head_commit
        )

        file_names = project_git(
            "ls-tree", "--full-tree", "--name-only", "-r", head_commit
        ).split("\n")
        files: tp.List[Path] = [
            repo_path / path
            for path in file_names
            if file_pattern.search(path)
        ]

        for file in files:
            nodes: tp.Set[BIGNodeTy] = set()
            blame_lines: str = project_git(
                "blame", "-w", "-s", "-l", "--root", head_commit, "--",
                str(file.relative_to(repo_path))
            )

            for line in blame_lines.strip().split("\n"):
                match = blame_regex.match(line)
                if not match:
                    raise AssertionError(
                        f"Unexpected blame line format: {line}"
                    )
                if match.group(2):
                    nodes.add(
                        BlameTaintData(
                            CommitRepoPair(
                                FullCommitHash(match.group(1)), repo_name
                            )
                        )
                    )

            # Connect all commits that touch the same file with each other.
            for node in nodes:
                interaction_graph.add_node(node, blame_taint_data=node)
            for commit_a, commit_b in itertools.product(nodes, repeat=2):
                if commit_a != commit_b:
                    if not interaction_graph.has_edge(commit_a, commit_b):
                        interaction_graph.add_edge(
                            commit_a, commit_b, amount=0
                        )
                    interaction_graph[commit_a][commit_b]["amount"] += 1

    return interaction_graph
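# A minimal sketch of the line format that ``blame_regex`` above parses,
# as produced by ``git blame -w -s -l --root`` (group 1: the full commit
# hash; the optional middle part absorbs a file name that git prints for
# lines originating in another file; group 2: the blamed line's content):
import re

_blame_regex = re.compile(r"^([0-9a-f]+)\s+(?:.+\s+)?[\d]+\) ?(.*)$")
_match = _blame_regex.match(
    "62662f87cdd96deda90ac817de94e3c4af75226a 42) int x = 0;"
)
assert _match is not None
assert _match.group(1) == "62662f87cdd96deda90ac817de94e3c4af75226a"
assert _match.group(2) == "int x = 0;"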
def plot(self, view_mode: bool) -> None:
    case_study = self.plot_kwargs["case_study"]

    style.use(self.plot_config.style())
    fig, axes = plt.subplots(1, 1, sharey="all")
    fig.subplots_adjust(hspace=0.5)

    fig.suptitle("Central Code")
    axes.set_title(case_study.project_name)
    axes.set_ylabel("Code Centrality")
    axes.set_xlabel("Commits")

    project_name = case_study.project_name
    revision = newest_processed_revision_for_case_study(
        case_study, BlameReport
    )
    if not revision:
        raise PlotDataEmpty()

    churn_config = ChurnConfig.create_c_style_languages_config()
    cig = create_blame_interaction_graph(
        project_name, revision
    ).commit_interaction_graph()
    commit_lookup = create_commit_lookup_helper(project_name)
    repo_lookup = get_local_project_gits(project_name)

    def filter_nodes(node: CommitRepoPair) -> bool:
        if node.commit_hash == UNCOMMITTED_COMMIT_HASH:
            return False
        return bool(commit_lookup(node))

    nodes: tp.List[tp.Dict[str, tp.Any]] = []
    for node in cig.nodes:
        node_attrs = tp.cast(CIGNodeAttrs, cig.nodes[node])
        commit = node_attrs["commit"]
        if not filter_nodes(commit):
            continue
        _, insertions, _ = calc_commit_code_churn(
            Path(repo_lookup[commit.repository_name].path),
            commit.commit_hash, churn_config
        )
        if insertions == 0:
            LOG.warning(f"Churn for commit {commit} is 0.")
            insertions = 1
        nodes.append({
            "commit_hash": commit.commit_hash,
            "degree": cig.degree(node),
            "insertions": insertions,
        })

    data = pd.DataFrame(nodes)
    data["code_centrality"] = data["degree"] - data["insertions"]
    data.sort_values(by="code_centrality", inplace=True)
    centrality_scores = data.loc[:, ["commit_hash", "code_centrality"]]
    centrality_scores.sort_values(by="code_centrality", inplace=True)
    axes.plot(centrality_scores["code_centrality"].values)
    axes.set_ylim(bottom=0)
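# A toy illustration (hypothetical numbers) of the centrality score
# computed above: a commit's degree in the commit interaction graph
# minus the lines it inserted, so small commits that many other commits
# interact with rank highest.
import pandas as pd

_toy = pd.DataFrame({
    "commit_hash": ["aaaa", "bbbb", "cccc"],
    "degree": [12, 3, 7],
    "insertions": [4, 10, 7],
})
_toy["code_centrality"] = _toy["degree"] - _toy["insertions"]
assert list(_toy["code_centrality"]) == [8, -7, 0]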
def plot(self, view_mode: bool) -> None:
    case_study = self.plot_kwargs["case_study"]

    project_name = case_study.project_name
    revision = newest_processed_revision_for_case_study(
        case_study, BlameReport
    )
    if not revision:
        raise PlotDataEmpty()

    cig = create_blame_interaction_graph(
        project_name, revision
    ).commit_interaction_graph()
    commit_lookup = create_commit_lookup_helper(project_name)
    repo_lookup = get_local_project_gits(project_name)
    code_churn_lookup = {
        repo_name: calc_repo_code_churn(
            repo, ChurnConfig.create_c_style_languages_config()
        ) for repo_name, repo in repo_lookup.items()
    }

    def filter_nodes(node: CommitRepoPair) -> bool:
        if node.commit_hash == UNCOMMITTED_COMMIT_HASH:
            return False
        return bool(commit_lookup(node))

    nodes: tp.List[tp.Dict[str, tp.Any]] = []
    for node in cig.nodes:
        node_attrs = tp.cast(CIGNodeAttrs, cig.nodes[node])
        commit = node_attrs["commit"]
        if not filter_nodes(commit):
            continue
        _, insertions, _ = code_churn_lookup[commit.repository_name][
            commit.commit_hash]
        nodes.append({
            "Case Study": project_name,
            "commit_hash": commit.commit_hash,
            "Commit Size": insertions,
            "Node Degree": cig.degree(node),
        })

    data = pd.DataFrame(nodes)
    data = apply_tukeys_fence(data, "Commit Size", 3.0)
    grid = multivariate_grid(
        "Commit Size", "Node Degree", "Case Study", data, global_kde=False
    )
    ax = grid.ax_joint
    ax.axvline(
        data["Commit Size"].quantile(0.20), color="#777777", linewidth=3
    )
    ax.axhline(
        data["Node Degree"].quantile(0.80), color="#777777", linewidth=3
    )
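# A minimal sketch of the outlier filtering assumed above; the actual
# ``apply_tukeys_fence`` implementation may differ in details. Tukey's
# fences keep only rows within [Q1 - k * IQR, Q3 + k * IQR]; k = 3.0
# corresponds to the "far out" fence.
import pandas as pd

def _tukeys_fence_sketch(
    df: pd.DataFrame, column: str, k: float
) -> pd.DataFrame:
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    return df[(df[column] >= q1 - k * iqr) & (df[column] <= q3 + k * iqr)]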
def test_one_commit_diff(self):
    """Check if we get the correct code churn for a single commit."""
    repo_path = get_local_project_git_path("brotli")

    files_changed, insertions, deletions = calc_commit_code_churn(
        repo_path,
        FullCommitHash("0c5603e07bed1d5fbb45e38f9bdf0e4560fde3f0"),
        ChurnConfig.create_c_style_languages_config()
    )

    self.assertEqual(files_changed, 1)
    self.assertEqual(insertions, 2)
    self.assertEqual(deletions, 2)
def test_one_commit_diff_3(self):
    """Check if we get the correct code churn for a single commit."""
    repo_path = get_local_project_git_path("brotli")

    files_changed, insertions, deletions = calc_commit_code_churn(
        repo_path,
        FullCommitHash("924b2b2b9dc54005edbcd85a1b872330948cdd9e"),
        ChurnConfig.create_c_style_languages_config()
    )

    self.assertEqual(files_changed, 3)
    self.assertEqual(insertions, 38)
    self.assertEqual(deletions, 7)
def test_one_commit_diff_2(self):
    """Check if we get the correct code churn for a single commit."""
    repo_path = get_local_project_git_path("brotli")

    files_changed, insertions, deletions = calc_commit_code_churn(
        repo_path,
        FullCommitHash("fc823290a76a260b7ba6f47ab5f52064a0ce19ff"),
        ChurnConfig.create_c_style_languages_config()
    )

    self.assertEqual(files_changed, 1)
    self.assertEqual(insertions, 5)
    self.assertEqual(deletions, 0)
def build_revisions_churn_table(
    project_name: str, commit_map: CommitMap,
    revisions: tp.List[FullCommitHash]
) -> pd.DataFrame:
    """
    Build a pandas data frame that contains all churn-related data for the
    given list of revisions.

    The churn is calculated as the diff between two successive revisions in
    the ``revisions`` list.

    Table layout: "revision", "time_id", "insertions", "deletions",
    "changed_files"

    Args:
        project_name: name of the project
        commit_map: CommitMap for the given project (by project_name)
        revisions: list of revisions used to calculate the churn data

    Returns:
        a data frame containing the churn data
    """

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=[
            "revision", "time_id", "insertions", "deletions", "changed_files"
        ])
        df_layout.time_id = df_layout.time_id.astype('int32')
        df_layout.insertions = df_layout.insertions.astype('int64')
        df_layout.deletions = df_layout.deletions.astype('int64')
        df_layout.changed_files = df_layout.changed_files.astype('int64')
        return df_layout

    repo_path = get_local_project_git_path(project_name)

    # Pair up successive revisions: (r0, r1), (r1, r2), ...
    revision_pairs = zip(*(islice(revisions, i, None) for i in range(2)))
    code_churn = [(0, 0, 0)]
    code_churn.extend([
        calc_code_churn(
            repo_path, a, b, ChurnConfig.create_c_style_languages_config()
        ) for a, b in revision_pairs
    ])
    churn_data = pd.DataFrame({
        "revision": revisions,
        "time_id": [commit_map.time_id(x) for x in revisions],
        "insertions": [x[1] for x in code_churn],
        "deletions": [x[2] for x in code_churn],
        "changed_files": [x[0] for x in code_churn]
    })

    return pd.concat([create_dataframe_layout(), churn_data])
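# A small demonstration of the successive-pair idiom used above: zipping
# two ``islice`` views of the same list, offset by one, yields
# (r0, r1), (r1, r2), ..., so each revision is diffed against its
# predecessor.
from itertools import islice

_revs = ["r0", "r1", "r2", "r3"]
_pairs = list(zip(*(islice(_revs, i, None) for i in range(2))))
assert _pairs == [("r0", "r1"), ("r1", "r2"), ("r2", "r3")]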
def test_commit_range(self):
    """Check if we get the correct code churn for a commit range."""
    repo_path = get_local_project_git_path("brotli")

    files_changed, insertions, deletions = calc_code_churn(
        repo_path,
        FullCommitHash("36ac0feaf9654855ee090b1f042363ecfb256f31"),
        FullCommitHash("924b2b2b9dc54005edbcd85a1b872330948cdd9e"),
        ChurnConfig.create_c_style_languages_config()
    )

    self.assertEqual(files_changed, 3)
    self.assertEqual(insertions, 49)
    self.assertEqual(deletions, 11)
def test_one_commit_diff_ignore_non_c_cpp_files(self):
    """Check if we get the correct code churn for a single commit but only
    consider code changes."""
    repo_path = get_local_project_git_path("brotli")

    files_changed, insertions, deletions = calc_commit_code_churn(
        repo_path,
        FullCommitHash("f503cb709ca181dbf5c73986ebac1b18ac5c9f63"),
        ChurnConfig.create_c_style_languages_config()
    )

    self.assertEqual(files_changed, 1)
    self.assertEqual(insertions, 11)
    self.assertEqual(deletions, 4)
def test_start_with_initial_commit(self):
    """Check if the initial commit is handled correctly."""
    repo_path = get_local_project_git_path("brotli")

    churn = calc_code_churn_range(
        repo_path, ChurnConfig.create_c_style_languages_config(),
        FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0"),
        FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0")
    )

    files_changed, insertions, deletions = churn[
        FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0")]
    self.assertEqual(files_changed, 11)
    self.assertEqual(insertions, 1730)
    self.assertEqual(deletions, 0)
def extend_with_distrib_sampling(
    case_study: CaseStudy, cmap: CommitMap,
    sampling_method: NormalSamplingMethod, merge_stage: int, num_rev: int,
    ignore_blocked: bool, only_code_commits: bool
) -> None:
    """
    Extend a case study by sampling 'num_rev' new revisions according to
    the distribution of the given sampling method.

    Args:
        case_study: case study to extend
        cmap: commit map to map revisions to unique IDs
        sampling_method: distribution to use for sampling
        merge_stage: stage the revisions will be added to
        num_rev: number of revisions to add
        ignore_blocked: ignore blocked revisions
        only_code_commits: exclude commits that do not change code
    """
    is_blocked: tp.Callable[[ShortCommitHash, tp.Type[Project]],
                            bool] = lambda rev, _: False
    if ignore_blocked:
        is_blocked = is_revision_blocked

    is_code_commit: tp.Callable[[ShortCommitHash], bool] = lambda rev: True
    if only_code_commits:
        churn_conf = ChurnConfig.create_c_style_languages_config()
        project_git_path = get_local_project_git_path(case_study.project_name)

        def is_c_cpp_code_commit(commit: ShortCommitHash) -> bool:
            return contains_source_code(commit, project_git_path, churn_conf)

        is_code_commit = is_c_cpp_code_commit

    # Needs to be sorted so the probability distribution over the length
    # of the list is the same as the distribution over the commits' age
    # history.
    project_cls = get_project_cls_by_name(case_study.project_name)
    revision_list = [
        (FullCommitHash(rev), idx)
        for rev, idx in sorted(cmap.mapping_items(), key=lambda x: x[1])
        if not case_study.has_revision_in_stage(
            ShortCommitHash(rev), merge_stage
        ) and not is_blocked(ShortCommitHash(rev), project_cls) and
        is_code_commit(ShortCommitHash(rev))
    ]

    case_study.include_revisions(
        sampling_method.sample_n(revision_list, num_rev), merge_stage
    )
def test_contains_source_code_without(self) -> None:
    """Check if we can correctly identify commits without source code."""
    churn_conf = ChurnConfig.create_c_style_languages_config()
    project_git_path = get_local_project_git_path('brotli')

    self.assertFalse(
        contains_source_code(
            ShortCommitHash('f4153a09f87cbb9c826d8fc12c74642bb2d879ea'),
            project_git_path, churn_conf
        )
    )
    self.assertFalse(
        contains_source_code(
            ShortCommitHash('e83c7b8e8fb8b696a1df6866bc46cbb76d7e0348'),
            project_git_path, churn_conf
        )
    )
    self.assertFalse(
        contains_source_code(
            ShortCommitHash('698e3a7f9d3000fa44174f5be415bf713f71bd0e'),
            project_git_path, churn_conf
        )
    )
def test_end_only(self):
    """Check if churn is correct if only the end of the range is set."""
    repo_path = get_local_project_git_path("brotli")

    churn = calc_code_churn_range(
        repo_path, ChurnConfig.create_c_style_languages_config(), None,
        FullCommitHash("645552217219c2877780ba4d7030044ec62d8255")
    )

    self.assertEqual(
        churn[FullCommitHash("645552217219c2877780ba4d7030044ec62d8255")],
        (2, 173, 145)
    )
    self.assertEqual(
        churn[FullCommitHash("e0346c826249368f0f4a68a2b95f4ab5cf1e235b")],
        (3, 51, 51)
    )
    self.assertEqual(
        churn[FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0")],
        (11, 1730, 0)
    )
def test_contains_source_code_with(self) -> None:
    """Check if we can correctly identify commits with source code."""
    churn_conf = ChurnConfig.create_c_style_languages_config()
    project_git_path = get_local_project_git_path('brotli')

    self.assertTrue(
        contains_source_code(
            ShortCommitHash('62662f87cdd96deda90ac817de94e3c4af75226a'),
            project_git_path, churn_conf
        )
    )
    self.assertTrue(
        contains_source_code(
            ShortCommitHash('27dd7265403d8e8fed99a854b9c3e1db7d79525f'),
            project_git_path, churn_conf
        )
    )
    # Merge commit of the previous one
    self.assertTrue(
        contains_source_code(
            ShortCommitHash('4ec67035c0d97c270c1c73038cc66fc5fcdfc120'),
            project_git_path, churn_conf
        )
    )
def filter_non_code_changes(
    blame_data: pd.DataFrame, project_name: str
) -> pd.DataFrame:
    """
    Filter all revisions out of the data frame that are not related to code
    changes.

    Args:
        blame_data: data to filter
        project_name: name of the project

    Returns:
        filtered data frame without rows related to non-code changes
    """
    repo = get_local_project_git(project_name)
    # Use a set for constant-time membership tests.
    code_related_changes = {
        x.hash for x in calc_repo_code_churn(
            repo, ChurnConfig.create_c_style_languages_config()
        )
    }
    return blame_data[blame_data.apply(
        lambda x: x['revision'] in code_related_changes, axis=1
    )]
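# A minimal, self-contained sketch of the filtering step above on
# hypothetical data: only rows whose revision hash appears in the
# code-related set survive.
import pandas as pd

_code_related_changes = {"aaaa", "cccc"}  # hypothetical commit hashes
_blame_data = pd.DataFrame({"revision": ["aaaa", "bbbb", "cccc"]})
_kept = _blame_data[_blame_data["revision"].isin(_code_related_changes)]
assert list(_kept["revision"]) == ["aaaa", "cccc"]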
def build_repo_churn_table(
    project_name: str, commit_map: CommitMap
) -> pd.DataFrame:
    """
    Build a pandas data table that contains all churn-related data for a
    repository.

    Table layout: "revision", "time_id", "insertions", "deletions",
    "changed_files"

    Args:
        project_name: name of the project
        commit_map: CommitMap for the given project (by project_name)
    """

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=[
            "revision", "time_id", "insertions", "deletions", "changed_files"
        ])
        df_layout.time_id = df_layout.time_id.astype('int32')
        df_layout.insertions = df_layout.insertions.astype('int64')
        df_layout.deletions = df_layout.deletions.astype('int64')
        df_layout.changed_files = df_layout.changed_files.astype('int64')
        return df_layout

    repo_path = get_local_project_git_path(project_name)
    # By default, we only look at c-style code files.
    code_churn = calc_repo_code_churn(
        repo_path, ChurnConfig.create_c_style_languages_config()
    )
    churn_data = pd.DataFrame({
        "revision": list(code_churn),
        "time_id": [commit_map.time_id(x) for x in code_churn],
        "insertions": [x[1] for x in code_churn.values()],
        "deletions": [x[2] for x in code_churn.values()],
        "changed_files": [x[0] for x in code_churn.values()]
    })

    return pd.concat([create_dataframe_layout(), churn_data])
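# A hedged usage sketch for ``build_repo_churn_table``, assuming a
# configured "brotli" project; ``get_commit_map`` is a hypothetical
# helper for obtaining the project's CommitMap:
#
#     commit_map = get_commit_map("brotli")
#     churn_table = build_repo_churn_table("brotli", commit_map)
#     recent = churn_table.sort_values(by="time_id").tail()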
def create_data_frame_for_report(
    report_paths: tp.Tuple[Path, Path]
) -> tp.Tuple[pd.DataFrame, str, str]:
    # Look up commit and infos about the HEAD commit of the report
    head_report = load_blame_report(report_paths[0])
    pred_report = load_blame_report(report_paths[1])
    commit = repo.get(head_report.head_commit.hash)
    commit_date = datetime.utcfromtimestamp(commit.commit_time)
    pred_commit = repo.get(pred_report.head_commit.hash)

    diff_between_head_pred = BlameReportDiff(head_report, pred_report)

    # Calculate the total churn between pred and base commit
    code_churn = calc_code_churn(
        Path(repo.path), FullCommitHash.from_pygit_commit(pred_commit),
        FullCommitHash.from_pygit_commit(commit),
        ChurnConfig.create_c_style_languages_config()
    )
    total_churn = code_churn[1] + code_churn[2]

    def weighted_avg(tuples: tp.List[tp.Tuple[int, int]]) -> float:
        total_sum = 0
        degree_sum = 0
        for degree, amount in tuples:
            degree_sum += degree
            total_sum += (degree * amount)
        return total_sum / max(1, degree_sum)

    def combine_max(tuples: tp.List[tp.Tuple[int, int]]) -> float:
        if tuples:
            return max(x for x, _ in tuples)
        return 0

    return (
        pd.DataFrame(
            {
                'revision': head_report.head_commit.hash,
                'time_id': commit_map.short_time_id(head_report.head_commit),
                'churn': total_churn,
                'num_interactions':
                    count_interactions(diff_between_head_pred),
                'num_interacting_commits':
                    count_interacting_commits(diff_between_head_pred),
                'num_interacting_authors':
                    count_interacting_authors(
                        diff_between_head_pred, commit_lookup
                    ),
                "ci_degree_mean":
                    weighted_avg(
                        generate_degree_tuples(diff_between_head_pred)
                    ),
                "author_mean":
                    weighted_avg(
                        generate_author_degree_tuples(
                            diff_between_head_pred, commit_lookup
                        )
                    ),
                "avg_time_mean":
                    weighted_avg(
                        generate_avg_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1
                        )
                    ),
                "ci_degree_max":
                    combine_max(
                        generate_degree_tuples(diff_between_head_pred)
                    ),
                "author_max":
                    combine_max(
                        generate_author_degree_tuples(
                            diff_between_head_pred, commit_lookup
                        )
                    ),
                "avg_time_max":
                    combine_max(
                        generate_max_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1
                        )
                    ),
                'year': commit_date.year,
            },
            index=[0]
        ), id_from_paths(report_paths), timestamp_from_paths(report_paths)
    )
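# Worked example for ``weighted_avg`` above: for (degree, amount) tuples
# it computes sum(degree * amount) / sum(degree), e.g.
#
#     weighted_avg([(1, 2), (3, 4)]) == (1*2 + 3*4) / (1 + 3) == 3.5
#
# and ``max(1, degree_sum)`` guards against division by zero for an
# empty tuple list.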
def test_enabled_language(self):
    """Check that language-level queries report enabled languages."""
    c_config = ChurnConfig.create_c_language_config()
    self.assertTrue(c_config.is_language_enabled(ChurnConfig.Language.C))
    self.assertFalse(c_config.is_language_enabled(ChurnConfig.Language.CPP))
def test_initial_config(self):
    """Check that the default config includes everything and enables no
    specific language."""
    init_config = ChurnConfig.create_default_config()
    self.assertTrue(init_config.include_everything)
    self.assertListEqual(init_config.enabled_languages, [])