Esempio n. 1
0
    def __init__(self, path_to_repo: str, clone_repo_to: str = None, at: str = 'release'):
        """ The class constructor.

        Validates the arguments, resolves the local path of the repository and collects the hashes of the
        commits to analyze.

        Parameters
        ----------
        path_to_repo : str
            The path to a local or remote repository.

        clone_repo_to : str
            Path to clone the repository to.
            If path_to_repo links to a local repository, this parameter is not used. Otherwise it is mandatory.

        at : str
            When to extract metrics: at each release ('release') or each commit ('commit').

        Attributes
        ----------
        path_to_repo : str
            ``path_to_repo`` for a local repository; ``<clone_repo_to>/<repository name>`` for a remote one.

        commits_at : list
            Hashes of the commits returned by traversing the repository (only releases when ``at`` is 'release').

        dataset: pandas.DataFrame
            The metrics dataset. It starts as an empty DataFrame.

        Raises
        ------
        ValueError
            If `at` is not release or commit, if `clone_repo_to` is missing for a remote repository, or if the
            path to the remote repository does not link to a github or gitlab repository in the form
            ``<owner>/<repository>``.

        """

        if at not in ('release', 'commit'):
            raise ValueError(f'{at} is not valid! Use \'release\' or \'commit\'.')

        self.path_to_repo = path_to_repo

        if is_remote(path_to_repo):

            if not clone_repo_to:
                raise ValueError('clone_repo_to is mandatory when linking to a remote repository.')

            # Drop only a *trailing* '.git' (and trailing slashes). Replacing every occurrence would mangle
            # repository names that merely contain it, e.g. 'owner/owner.github.io'.
            remote_url = path_to_repo.rstrip('/')
            if remote_url.endswith('.git'):
                remote_url = remote_url[:-len('.git')]

            full_name_pattern = re.compile(r'git(hub|lab)\.com/([\w\W]+)$')
            match = full_name_pattern.search(remote_url)

            if not match:
                raise ValueError('The remote repository must be hosted on github or gitlab.')

            # Expect at least <owner>/<repository>; fail with the documented exception instead of an IndexError.
            segments = [segment for segment in match.group(2).split('/') if segment]
            if len(segments) < 2:
                raise ValueError('The remote repository must be in the form <owner>/<repository>.')

            # The clone directory is named after the last path segment, which also covers nested gitlab groups
            # (<group>/<subgroup>/<repository>).
            repo_name = segments[-1]
            self.path_to_repo = os.path.join(clone_repo_to, repo_name)

            # The target directory already exists: do not ask for a new clone into it.
            # NOTE(review): Repository below still receives the remote URL in this case; confirm whether mining
            # the existing local clone (self.path_to_repo) was intended instead.
            if os.path.isdir(self.path_to_repo):
                clone_repo_to = None

        repo_miner = Repository(path_to_repo=path_to_repo,
                                clone_repo_to=clone_repo_to,
                                only_releases=at == 'release',
                                order='date-order', num_workers=1)

        self.commits_at = [commit.hash for commit in repo_miner.traverse_commits()]
        self.dataset = pd.DataFrame()
Esempio n. 2
0
    def get_fixed_files(self) -> None:
        """
        Populate the list of FixedFile objects.

        A FixedFile is a file modified in a bug-fixing commit that consists of a filename, hash of the commit that fixed
        it, and hash of the commit that introduced the bug.

        It uses the SZZ algorithm implemented in PyDriller to identify the oldest commit that introduced the bug,
        referred to as bug-introducing commit.

        Only MODIFIED and RENAMED files are considered. Renames found while walking the history are recorded so
        that a fixed file is stored under the path it was later renamed to.

        `Note:` before calling this method, it is necessary that you run at least one between
        `get_fixing_commits_from_closed_issues` and `get_fixing_commits_from_commit_messages`.
        If there are no fixing commits the method returns immediately and leaves `fixed_files` untouched.


        Returns
        -------
        None

        """

        if not self.fixing_commits:
            return

        self.sort_commits(self.fixing_commits)

        self.fixed_files = list()
        renamed_files = dict()  # old path -> most recent known path
        git_repo = Git(self.path_to_repo)

        # Membership is tested for every traversed commit: use a set rather than scanning the list each time.
        fixing_hashes = set(self.fixing_commits)

        if len(self.fixing_commits) == 1:
            repository_mining = Repository(self.path_to_repo,
                                           single=self.fixing_commits[0],
                                           only_in_branch=self.branch,
                                           num_workers=1)
        else:
            repository_mining = Repository(
                self.path_to_repo,
                from_commit=self.fixing_commits[-1],  # Last fixing-commit by date
                to_commit=self.fixing_commits[0],  # First fixing-commit by date
                order='reverse',
                only_in_branch=self.branch,
                num_workers=1)

        # Traverse commits from the latest to the first fixing-commit
        for commit in repository_mining.traverse_commits():

            is_fixing_commit = commit.hash in fixing_hashes

            for modified_file in commit.modified_files:

                # Not interested in ADDED and DELETED files
                if modified_file.change_type not in (ModificationType.MODIFY,
                                                     ModificationType.RENAME):
                    continue

                # If RENAMED then map the old path to the latest known path of the file
                if modified_file.change_type == ModificationType.RENAME:
                    renamed_files[modified_file.old_path] = renamed_files.get(
                        modified_file.new_path, modified_file.new_path)

                # Renames are tracked for every commit (above); everything below concerns fixing-commits only,
                # so otherwise go to the next file
                if not is_fixing_commit:
                    continue

                # Not interested in type of files
                if self.ignore_file(modified_file.new_path,
                                    modified_file.source_code):
                    continue

                # Identify bug-inducing commits. Dict[modified_file, Set[commit_hashes]]
                inducing_by_path = git_repo.get_commits_last_modified_lines(
                    commit, modified_file)

                bug_inducing_commits = inducing_by_path.get(modified_file.new_path)
                if not bug_inducing_commits:
                    continue

                bug_inducing_commits = list(bug_inducing_commits)
                self.sort_commits(bug_inducing_commits)
                bic = bug_inducing_commits[0]  # bic is the oldest bug-inducing-commit

                current_fix = FixedFile(filepath=renamed_files.get(
                    modified_file.new_path, modified_file.new_path),
                                        bic=bic,
                                        fic=commit.hash)

                if current_fix not in self.fixed_files:
                    self.fixed_files.append(current_fix)
                else:
                    idx = self.fixed_files.index(current_fix)
                    existing_fix = self.fixed_files[idx]

                    # Positions in self.commit_hashes are compared as "older than" (lower index = older),
                    # following the intent stated below.
                    current_fic_position = self.commit_hashes.index(current_fix.fic)
                    existing_bic_position = self.commit_hashes.index(existing_fix.bic)

                    # If the current FIC is older than the existing bic, then save it as a new FixedFile.
                    # Else it means the current fix is between the existing fix bic and fic.
                    # If the current BIC is older than the existing bic, then update the bic.
                    if current_fic_position < existing_bic_position:

                        # Stop mapping this path to a later name and store the fix under its own path
                        renamed_files.pop(modified_file.new_path, None)

                        current_fix.filepath = modified_file.new_path
                        self.fixed_files.append(current_fix)
                    elif self.commit_hashes.index(current_fix.bic) < existing_bic_position:
                        existing_fix.bic = current_fix.bic