    def test_extensions_repr_gen(self):
        c_config = ChurnConfig.create_c_language_config()
        self.assertEqual(c_config.get_extensions_repr(), ["c", "h"])
        self.assertEqual(c_config.get_extensions_repr(prefix="*."),
                         ["*.c", "*.h"])
        self.assertEqual(c_config.get_extensions_repr(suffix="|"),
                         ["c|", "h|"])

        c_style_config = ChurnConfig.create_c_style_languages_config()
        self.assertEqual(c_style_config.get_extensions_repr(),
                         ["c", "cpp", "cxx", "h", "hpp", "hxx"])
        self.assertEqual(c_style_config.get_extensions_repr(prefix="*."),
                         ["*.c", "*.cpp", "*.cxx", "*.h", "*.hpp", "*.hxx"])
        self.assertEqual(c_style_config.get_extensions_repr(suffix="|"),
                         ["c|", "cpp|", "cxx|", "h|", "hpp|", "hxx|"])
    def test_enable_language(self):
        init_config = ChurnConfig.create_default_config()
        self.assertFalse(init_config.is_enabled('c'))
        init_config.enable_language(ChurnConfig.Language.CPP)
        self.assertFalse(init_config.is_enabled('c'))
        init_config.enable_language(ChurnConfig.Language.C)
        self.assertTrue(init_config.is_enabled('c'))
def _collect_cig_node_data(
        project_name: str,
        revision: FullCommitHash) -> tp.List[tp.Dict[str, tp.Any]]:
    churn_config = ChurnConfig.create_c_style_languages_config()
    cig = create_blame_interaction_graph(project_name,
                                         revision).commit_interaction_graph()
    commit_lookup = create_commit_lookup_helper(project_name)
    repo_lookup = get_local_project_gits(project_name)

    def filter_nodes(node: CommitRepoPair) -> bool:
        if node.commit_hash == UNCOMMITTED_COMMIT_HASH:
            return False
        return bool(commit_lookup(node))

    nodes: tp.List[tp.Dict[str, tp.Any]] = []
    for node in cig.nodes:
        node_attrs = tp.cast(CIGNodeAttrs, cig.nodes[node])
        commit = node_attrs["commit"]
        if not filter_nodes(commit):
            continue
        _, insertions, _ = calc_commit_code_churn(
            Path(repo_lookup[commit.repository_name].path), commit.commit_hash,
            churn_config)
        if insertions == 0:
            LOG.warning(f"Churn for commit {commit} is 0.")
            insertions = 1
        nodes.append({
            "commit_hash": commit.commit_hash.hash,
            "degree": cig.degree(node),
            "insertions": insertions,
        })
    return nodes
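A hedged usage sketch: the node records returned above can feed a pandas frame to compute the code-centrality score used by the plot code further below (the import and a processed `revision` are assumptions here):

import pandas as pd

# Hypothetical call; `revision` must be a FullCommitHash with processed data.
data = pd.DataFrame(_collect_cig_node_data("brotli", revision))
data["code_centrality"] = data["degree"] - data["insertions"]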
    def test_c_style_config(self):
        c_style_config = ChurnConfig.create_c_style_languages_config()
        self.assertTrue(c_style_config.is_enabled('h'))
        self.assertTrue(c_style_config.is_enabled('c'))
        self.assertTrue(c_style_config.is_enabled('hpp'))
        self.assertTrue(c_style_config.is_enabled('cpp'))
        self.assertTrue(c_style_config.is_enabled('hxx'))
        self.assertTrue(c_style_config.is_enabled('cxx'))
Example #5
        def create_graph() -> nx.DiGraph:
            repos = get_local_project_gits(self.project_name)
            interaction_graph = nx.DiGraph()
            churn_config = ChurnConfig.create_c_style_languages_config()
            file_pattern = re.compile(r"|".join(
                churn_config.get_extensions_repr(prefix=r"\.", suffix=r"$")))

            blame_regex = re.compile(
                r"^([0-9a-f]+)\s+(?:.+\s+)?[\d]+\) ?(.*)$")

            for repo_name in repos:
                repo_path = get_local_project_git_path(self.project_name,
                                                       repo_name)
                project_git = git["-C", str(repo_path)]
                head_commit = get_submodule_head(self.project_name, repo_name,
                                                 self.__head_commit)

                file_names = project_git("ls-tree", "--full-tree",
                                         "--name-only", "-r",
                                         head_commit).split("\n")
                files: tp.List[Path] = [
                    repo_path / path for path in file_names
                    if file_pattern.search(path)
                ]
                for file in files:
                    nodes: tp.Set[BIGNodeTy] = set()
                    blame_lines: str = project_git(
                        "blame", "-w", "-s", "-l", "--root", head_commit, "--",
                        str(file.relative_to(repo_path)))

                    for line in blame_lines.strip().split("\n"):
                        match = blame_regex.match(line)
                        if not match:
                            raise AssertionError(
                                f"unexpected blame output line: {line}")

                        if match.group(2):
                            nodes.add(
                                BlameTaintData(
                                    CommitRepoPair(
                                        FullCommitHash(match.group(1)),
                                        repo_name)))

                    for node in nodes:
                        interaction_graph.add_node(node, blame_taint_data=node)
                    for commit_a, commit_b in itertools.product(nodes,
                                                                repeat=2):
                        if commit_a != commit_b:
                            if not interaction_graph.has_edge(
                                    commit_a, commit_b):
                                interaction_graph.add_edge(commit_a,
                                                           commit_b,
                                                           amount=0)
                            interaction_graph[commit_a][commit_b][
                                "amount"] += 1
            return interaction_graph
Example #6
    def plot(self, view_mode: bool) -> None:
        case_study = self.plot_kwargs["case_study"]

        style.use(self.plot_config.style())
        fig, axes = plt.subplots(1, 1, sharey="all")
        fig.subplots_adjust(hspace=0.5)

        fig.suptitle("Central Code")
        axes.set_title(case_study.project_name)
        axes.set_ylabel("Code Centrality")
        axes.set_xlabel("Commits")

        project_name = case_study.project_name
        revision = newest_processed_revision_for_case_study(
            case_study, BlameReport)
        if not revision:
            raise PlotDataEmpty()

        churn_config = ChurnConfig.create_c_style_languages_config()
        cig = create_blame_interaction_graph(
            project_name, revision).commit_interaction_graph()
        commit_lookup = create_commit_lookup_helper(project_name)
        repo_lookup = get_local_project_gits(project_name)

        def filter_nodes(node: CommitRepoPair) -> bool:
            if node.commit_hash == UNCOMMITTED_COMMIT_HASH:
                return False
            return bool(commit_lookup(node))

        nodes: tp.List[tp.Dict[str, tp.Any]] = []
        for node in cig.nodes:
            node_attrs = tp.cast(CIGNodeAttrs, cig.nodes[node])
            commit = node_attrs["commit"]
            if not filter_nodes(commit):
                continue
            _, insertions, _ = calc_commit_code_churn(
                Path(repo_lookup[commit.repository_name].path),
                commit.commit_hash, churn_config)
            if insertions == 0:
                LOG.warning(f"Churn for commit {commit} is 0.")
                insertions = 1
            nodes.append({
                "commit_hash": commit.commit_hash,
                "degree": cig.degree(node),
                "insertions": insertions,
            })

        data = pd.DataFrame(nodes)
        data["code_centrality"] = data["degree"] - data["insertions"]
        data.sort_values(by="code_centrality", inplace=True)
        centrality_scores = data.loc[:, ["commit_hash", "code_centrality"]]
        centrality_scores.sort_values(by="code_centrality", inplace=True)
        axes.plot(centrality_scores["code_centrality"].values)
        axes.set_ylim(bottom=0)
    def plot(self, view_mode: bool) -> None:
        case_study = self.plot_kwargs["case_study"]
        project_name = case_study.project_name
        revision = newest_processed_revision_for_case_study(
            case_study, BlameReport)
        if not revision:
            raise PlotDataEmpty()

        cig = create_blame_interaction_graph(
            project_name, revision).commit_interaction_graph()

        commit_lookup = create_commit_lookup_helper(project_name)
        repo_lookup = get_local_project_gits(project_name)
        code_churn_lookup = {
            repo_name:
            calc_repo_code_churn(repo,
                                 ChurnConfig.create_c_style_languages_config())
            for repo_name, repo in repo_lookup.items()
        }

        def filter_nodes(node: CommitRepoPair) -> bool:
            if node.commit_hash == UNCOMMITTED_COMMIT_HASH:
                return False
            return bool(commit_lookup(node))

        nodes: tp.List[tp.Dict[str, tp.Any]] = []
        for node in cig.nodes:
            node_attrs = tp.cast(CIGNodeAttrs, cig.nodes[node])
            commit = node_attrs["commit"]
            if not filter_nodes(commit):
                continue
            _, insertions, _ = code_churn_lookup[commit.repository_name][
                commit.commit_hash]
            nodes.append({
                "Case Study": project_name,
                "commit_hash": commit.commit_hash,
                "Commit Size": insertions,
                "Node Degree": cig.degree(node),
            })
        data = pd.DataFrame(nodes)
        data = apply_tukeys_fence(data, "Commit Size", 3.0)
        grid = multivariate_grid("Commit Size",
                                 "Node Degree",
                                 "Case Study",
                                 data,
                                 global_kde=False)

        ax = grid.ax_joint
        ax.axvline(data["Commit Size"].quantile(0.20),
                   color="#777777",
                   linewidth=3)
        ax.axhline(data["Node Degree"].quantile(0.80),
                   color="#777777",
                   linewidth=3)
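apply_tukeys_fence above trims outliers before plotting. A stand-alone sketch of the underlying idea, assuming it implements the classic Tukey fences (this is not the library's actual implementation):

def tukeys_fence_sketch(df: pd.DataFrame, col: str, k: float) -> pd.DataFrame:
    # Keep rows inside [Q1 - k * IQR, Q3 + k * IQR]; k = 3.0 is the
    # conventional "far out" fence.
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    return df[(df[col] >= q1 - k * iqr) & (df[col] <= q3 + k * iqr)]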
    def test_one_commit_diff(self):
        """Check if we get the correct code churn for a single commit."""

        repo_path = get_local_project_git_path("brotli")

        files_changed, insertions, deletions = calc_commit_code_churn(
            repo_path,
            FullCommitHash("0c5603e07bed1d5fbb45e38f9bdf0e4560fde3f0"),
            ChurnConfig.create_c_style_languages_config())

        self.assertEqual(files_changed, 1)
        self.assertEqual(insertions, 2)
        self.assertEqual(deletions, 2)
    def test_one_commit_diff_3(self):
        """Check if we get the correct code churn for a single commit."""

        repo_path = get_local_project_git_path("brotli")

        files_changed, insertions, deletions = calc_commit_code_churn(
            repo_path,
            FullCommitHash("924b2b2b9dc54005edbcd85a1b872330948cdd9e"),
            ChurnConfig.create_c_style_languages_config())

        self.assertEqual(files_changed, 3)
        self.assertEqual(insertions, 38)
        self.assertEqual(deletions, 7)
    def test_one_commit_diff_2(self):
        """Check if we get the correct code churn for a single commit."""

        repo_path = get_local_project_git_path("brotli")

        files_changed, insertions, deletions = calc_commit_code_churn(
            repo_path,
            FullCommitHash("fc823290a76a260b7ba6f47ab5f52064a0ce19ff"),
            ChurnConfig.create_c_style_languages_config())

        self.assertEqual(files_changed, 1)
        self.assertEqual(insertions, 5)
        self.assertEqual(deletions, 0)
def build_revisions_churn_table(
        project_name: str, commit_map: CommitMap,
        revisions: tp.List[FullCommitHash]) -> pd.DataFrame:
    """
    Build a pandas data frame that contains all churn-related data for the given
    list of revisions.

    The churn is calculated as the diff between two successive revisions in
    the ``revisions`` list.

    Table layout:
            "revision", "time_id", "insertions", "deletions", "changed_files"

    Args:
        project_name: name of the project
        commit_map: CommitMap for the given project (by project_name)
        revisions: list of revisions used to calculate the churn data

    Returns:
        a data frame containing the churn data
    """
    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=[
            "revision", "time_id", "insertions", "deletions", "changed_files"
        ])
        df_layout.time_id = df_layout.time_id.astype('int32')
        df_layout.insertions = df_layout.insertions.astype('int64')
        df_layout.deletions = df_layout.deletions.astype('int64')
        df_layout.changed_files = df_layout.changed_files.astype('int64')
        return df_layout

    repo_path = get_local_project_git_path(project_name)

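    # Pair each revision with its successor: (r0, r1), (r1, r2), ...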
    revision_pairs = zip(*(islice(revisions, i, None) for i in range(2)))
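    # The first revision has no predecessor, so seed it with zero churn.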
    code_churn = [(0, 0, 0)]
    code_churn.extend([
        calc_code_churn(repo_path, a, b,
                        ChurnConfig.create_c_style_languages_config())
        for a, b in revision_pairs
    ])
    churn_data = pd.DataFrame({
        "revision":
        revisions,
        "time_id": [commit_map.time_id(x) for x in revisions],
        "insertions": [x[1] for x in code_churn],
        "deletions": [x[2] for x in code_churn],
        "changed_files": [x[0] for x in code_churn]
    })

    return pd.concat([create_dataframe_layout(), churn_data])
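A minimal usage sketch, assuming the "brotli" project is set up locally and a CommitMap `cmap` is available (the hashes are the brotli commits used by the tests on this page):

churn_df = build_revisions_churn_table(
    "brotli", cmap, [
        FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0"),
        FullCommitHash("645552217219c2877780ba4d7030044ec62d8255"),
    ])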
    def test_commit_range(self):
        """Check if we get the correct code churn for commit range."""

        repo_path = get_local_project_git_path("brotli")

        files_changed, insertions, deletions = calc_code_churn(
            repo_path,
            FullCommitHash("36ac0feaf9654855ee090b1f042363ecfb256f31"),
            FullCommitHash("924b2b2b9dc54005edbcd85a1b872330948cdd9e"),
            ChurnConfig.create_c_style_languages_config())

        self.assertEqual(files_changed, 3)
        self.assertEqual(insertions, 49)
        self.assertEqual(deletions, 11)
    def test_one_commit_diff_ignore_non_c_cpp_files(self):
        """Check if we get the correct code churn for a single commit but only
        consider code changes."""

        repo_path = get_local_project_git_path("brotli")

        files_changed, insertions, deletions = calc_commit_code_churn(
            repo_path,
            FullCommitHash("f503cb709ca181dbf5c73986ebac1b18ac5c9f63"),
            ChurnConfig.create_c_style_languages_config())

        self.assertEqual(files_changed, 1)
        self.assertEqual(insertions, 11)
        self.assertEqual(deletions, 4)
    def test_start_with_initial_commit(self):
        """Check if the initial commit is handled correctly."""

        repo_path = get_local_project_git_path("brotli")

        churn = calc_code_churn_range(
            repo_path, ChurnConfig.create_c_style_languages_config(),
            FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0"),
            FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0"))

        files_changed, insertions, deletions = churn[FullCommitHash(
            "8f30907d0f2ef354c2b31bdee340c2b11dda0fb0")]
        self.assertEqual(files_changed, 11)
        self.assertEqual(insertions, 1730)
        self.assertEqual(deletions, 0)
Example #15
def extend_with_distrib_sampling(case_study: CaseStudy, cmap: CommitMap,
                                 sampling_method: NormalSamplingMethod,
                                 merge_stage: int, num_rev: int,
                                 ignore_blocked: bool,
                                 only_code_commits: bool) -> None:
    """
    Extend a case study by sampling ``num_rev`` new revisions according to the
    distribution specified by ``sampling_method``.

    Args:
        case_study: to extend
        cmap: commit map to map revisions to unique IDs
        sampling_method: distribution to use for sampling
        merge_stage: stage the revisions will be added to
        num_rev: number of revisions to add
        ignore_blocked: ignore blocked revisions
        only_code_commits: exclude commits which don't change code
    """
    is_blocked: tp.Callable[[ShortCommitHash, tp.Type[Project]],
                            bool] = lambda rev, _: False
    if ignore_blocked:
        is_blocked = is_revision_blocked

    is_code_commit: tp.Callable[[ShortCommitHash], bool] = lambda rev: True
    if only_code_commits:
        churn_conf = ChurnConfig.create_c_style_languages_config()
        project_git_path = get_local_project_git_path(case_study.project_name)

        def is_c_cpp_code_commit(commit: ShortCommitHash) -> bool:
            return contains_source_code(commit, project_git_path, churn_conf)

        is_code_commit = is_c_cpp_code_commit

    # Needs to be sorted so the probability distribution over the length
    # of the list is the same as the distribution over the commits' age history.
    project_cls = get_project_cls_by_name(case_study.project_name)
    revision_list = [
        (FullCommitHash(rev), idx)
        for rev, idx in sorted(list(cmap.mapping_items()), key=lambda x: x[1])
        if
        not case_study.has_revision_in_stage(ShortCommitHash(rev), merge_stage)
        and not is_blocked(ShortCommitHash(rev), project_cls)
        and is_code_commit(ShortCommitHash(rev))
    ]

    case_study.include_revisions(
        sampling_method.sample_n(revision_list, num_rev), merge_stage)
    def test_contains_source_code_without(self) -> None:
        """Check if we can correctly identify commits with source code."""
        churn_conf = ChurnConfig.create_c_style_languages_config()
        project_git_path = get_local_project_git_path('brotli')

        self.assertFalse(
            contains_source_code(
                ShortCommitHash('f4153a09f87cbb9c826d8fc12c74642bb2d879ea'),
                project_git_path, churn_conf))
        self.assertFalse(
            contains_source_code(
                ShortCommitHash('e83c7b8e8fb8b696a1df6866bc46cbb76d7e0348'),
                project_git_path, churn_conf))
        self.assertFalse(
            contains_source_code(
                ShortCommitHash('698e3a7f9d3000fa44174f5be415bf713f71bd0e'),
                project_git_path, churn_conf))
    def test_end_only(self):
        """Check if churn is correct if only end range is set."""

        repo_path = get_local_project_git_path("brotli")

        churn = calc_code_churn_range(
            repo_path, ChurnConfig.create_c_style_languages_config(), None,
            FullCommitHash("645552217219c2877780ba4d7030044ec62d8255"))

        self.assertEqual(
            churn[FullCommitHash("645552217219c2877780ba4d7030044ec62d8255")],
            (2, 173, 145))
        self.assertEqual(
            churn[FullCommitHash("e0346c826249368f0f4a68a2b95f4ab5cf1e235b")],
            (3, 51, 51))
        self.assertEqual(
            churn[FullCommitHash("8f30907d0f2ef354c2b31bdee340c2b11dda0fb0")],
            (11, 1730, 0))
    def test_contains_source_code_with(self) -> None:
        """Check if we can correctly identify commits without source code."""
        churn_conf = ChurnConfig.create_c_style_languages_config()
        project_git_path = get_local_project_git_path('brotli')

        self.assertTrue(
            contains_source_code(
                ShortCommitHash('62662f87cdd96deda90ac817de94e3c4af75226a'),
                project_git_path, churn_conf))
        self.assertTrue(
            contains_source_code(
                ShortCommitHash('27dd7265403d8e8fed99a854b9c3e1db7d79525f'),
                project_git_path, churn_conf))
        # Merge commit of the previous one
        self.assertTrue(
            contains_source_code(
                ShortCommitHash('4ec67035c0d97c270c1c73038cc66fc5fcdfc120'),
                project_git_path, churn_conf))
Example #19
def filter_non_code_changes(blame_data: pd.DataFrame,
                            project_name: str) -> pd.DataFrame:
    """
    Filter all revisions from the data frame that are not related to code changes.

    Args:
        blame_data: data to filter
        project_name: name of the project

    Returns:
        filtered data frame without rows related to non-code changes
    """
    repo = get_local_project_git(project_name)
    code_related_changes = [
        x.hash for x in calc_repo_code_churn(
            repo, ChurnConfig.create_c_style_languages_config())
    ]
    return blame_data[blame_data.apply(
        lambda x: x['revision'] in code_related_changes, axis=1)]
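A hedged usage sketch with a hypothetical input frame; only rows whose `revision` value appears in the repository's C/C++ code churn are kept:

import pandas as pd

# Hypothetical hashes; the filter only inspects the 'revision' column.
blame_data = pd.DataFrame({"revision": ["<code-commit>", "<doc-only-commit>"]})
code_only = filter_non_code_changes(blame_data, "brotli")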
def build_repo_churn_table(project_name: str,
                           commit_map: CommitMap) -> pd.DataFrame:
    """
    Build a pandas data table that contains all churn-related data for a
    repository.

    Table layout:
            "revision", "time_id", "insertions", "deletions", "changed_files"

    Args:
        project_name: name of the project
        commit_map: CommitMap for the given project (by project_name)
    """
    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=[
            "revision", "time_id", "insertions", "deletions", "changed_files"
        ])
        df_layout.time_id = df_layout.time_id.astype('int32')
        df_layout.insertions = df_layout.insertions.astype('int64')
        df_layout.deletions = df_layout.deletions.astype('int64')
        df_layout.changed_files = df_layout.changed_files.astype('int64')
        return df_layout

    repo_path = get_local_project_git_path(project_name)
    # By default, we only look at C-style code files.
    code_churn = calc_repo_code_churn(
        repo_path, ChurnConfig.create_c_style_languages_config())
    churn_data = pd.DataFrame({
        "revision":
        list(code_churn),
        "time_id": [commit_map.time_id(x) for x in code_churn],
        "insertions": [x[1] for x in code_churn.values()],
        "deletions": [x[2] for x in code_churn.values()],
        "changed_files": [x[0] for x in code_churn.values()]
    })

    return pd.concat([create_dataframe_layout(), churn_data])
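Hypothetical usage, again assuming the "brotli" project and a CommitMap `cmap` are available; the resulting frame follows the table layout documented above:

churn_df = build_repo_churn_table("brotli", cmap)
print(churn_df.columns.tolist())
# ['revision', 'time_id', 'insertions', 'deletions', 'changed_files']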
        def create_data_frame_for_report(
            report_paths: tp.Tuple[Path, Path]
        ) -> tp.Tuple[pd.DataFrame, str, str]:
            # Look up the commit and info about the HEAD commit of the report
            head_report = load_blame_report(report_paths[0])
            pred_report = load_blame_report(report_paths[1])
            commit = repo.get(head_report.head_commit.hash)
            commit_date = datetime.utcfromtimestamp(commit.commit_time)
            pred_commit = repo.get(pred_report.head_commit.hash)

            diff_between_head_pred = BlameReportDiff(head_report, pred_report)

            # Calculate the total churn between the pred and head commits
            code_churn = calc_code_churn(
                Path(repo.path), FullCommitHash.from_pygit_commit(pred_commit),
                FullCommitHash.from_pygit_commit(commit),
                ChurnConfig.create_c_style_languages_config())
            total_churn = code_churn[1] + code_churn[2]

            def weighted_avg(tuples: tp.List[tp.Tuple[int, int]]) -> float:
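                # Computes sum(degree * amount) / sum(degree);
                # max(1, ...) guards against division by zero.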
                total_sum = 0
                degree_sum = 0
                for degree, amount in tuples:
                    degree_sum += degree
                    total_sum += (degree * amount)

                return total_sum / max(1, degree_sum)

            def combine_max(tuples: tp.List[tp.Tuple[int, int]]) -> float:
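                # Largest degree observed across the tuples; 0 if there are none.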
                if tuples:
                    return max(degree for degree, _ in tuples)
                return 0

            return (pd.DataFrame(
                {
                    'revision':
                    head_report.head_commit.hash,
                    'time_id':
                    commit_map.short_time_id(head_report.head_commit),
                    'churn':
                    total_churn,
                    'num_interactions':
                    count_interactions(diff_between_head_pred),
                    'num_interacting_commits':
                    count_interacting_commits(diff_between_head_pred),
                    'num_interacting_authors':
                    count_interacting_authors(diff_between_head_pred,
                                              commit_lookup),
                    "ci_degree_mean":
                    weighted_avg(
                        generate_degree_tuples(diff_between_head_pred)),
                    "author_mean":
                    weighted_avg(
                        generate_author_degree_tuples(diff_between_head_pred,
                                                      commit_lookup)),
                    "avg_time_mean":
                    weighted_avg(
                        generate_avg_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1)),
                    "ci_degree_max":
                    combine_max(
                        generate_degree_tuples(diff_between_head_pred)),
                    "author_max":
                    combine_max(
                        generate_author_degree_tuples(diff_between_head_pred,
                                                      commit_lookup)),
                    "avg_time_max":
                    combine_max(
                        generate_max_time_distribution_tuples(
                            diff_between_head_pred, commit_lookup, 1)),
                    'year':
                    commit_date.year,
                },
                index=[0]), id_from_paths(report_paths),
                    timestamp_from_paths(report_paths))
    def test_enabled_language(self):
        c_config = ChurnConfig.create_c_language_config()
        self.assertTrue(c_config.is_language_enabled(ChurnConfig.Language.C))
        self.assertFalse(
            c_config.is_language_enabled(ChurnConfig.Language.CPP))
    def test_initial_config(self):
        init_config = ChurnConfig.create_default_config()
        self.assertTrue(init_config.include_everything)
        self.assertListEqual(init_config.enabled_languages, [])