Example #1
    def _get_commit_refs(self, repo_url, local_path, from_tag, to_tag):
        """
        Scan all commits between the two tags [`tag_start` .. `tag_end`]
        Extract any text from the commit message showing a github tag reference `#{number}`
        and return a list of ints

        :param repo_url: GitHub URL, used for finding issues/Pull requests
        :type  repo_url: str

        :param local_path: (Optional) path to scan a local repository and cross reference with GitHub
        :type  local_path: Path

        :param from_tag: Git Start Tag
        :type  from_tag: str

        :param to_tag: Git end tag
        :type  to_tag: str

        :return: Github rife references
        :rtype:  List of ints
        """
        self.logger.info("Fetching commits between tags {}...{} ".format(from_tag, to_tag))

        if local_path:
            repo = RepositoryMining(local_path, from_tag=from_tag, to_tag=to_tag)
        else:
            repo = RepositoryMining(repo_url, from_tag=from_tag, to_tag=to_tag)

        # collect every "#123"-style reference, flatten, and de-duplicate
        commit_list = [re.findall(r'#\d+', commit.msg) for commit in repo.traverse_commits()]
        commit_list = sum(commit_list, [])  # flatten the list of lists
        return set(map(lambda cm: int(cm[1:]), commit_list))
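A possible usage sketch (hedged: `notes` stands in for an instance of the surrounding class, and the URL/tags are placeholders):

# Hypothetical usage of _get_commit_refs:
refs = notes._get_commit_refs(
    repo_url='https://github.com/owner/project',  # placeholder URL
    local_path=None,        # no local clone: mine the remote URL directly
    from_tag='v1.0.0',
    to_tag='v1.1.0',
)
print(sorted(refs))         # e.g. [12, 34, 56]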
Example #2
def fetch_keyword_introduce(repo, keyword):
    # NOTE: `keyword` is currently unused; the regex below is hard-coded.
    conditional_tag = re.compile(r'\+@Conditional.*\(')

    repo = RepositoryMining(repo, only_modifications_with_file_types=['.java'])

    commits = repo.traverse_commits()

    def process(data):
        try:
            for m in data.modifications:
                matches = re.findall(conditional_tag, m.diff)
                for e in matches:
                    # strip the leading "+@" and the trailing "("
                    print(e[2:len(e) - 1], data.committer_date, sep=" ; ")

        except TypeError:
            # print("WARNING cannot analyse commit : ", data.hash)
            pass

    # One (never-joined) thread per commit; fine for small repositories,
    # but this can exhaust resources on large histories.
    for commit in commits:
        t = Thread(target=process, args=(commit,))
        t.start()
Example #3
def collect_data(path_to_repo, commit_shas):
    rm = RepositoryMining(path_to_repo, only_commits=commit_shas)
    commits = []
    modifications = []
    for commit in rm.traverse_commits():
        commits.append(commit)
        for mod in commit.modifications:
            modifications.append((commit, mod))

    return commits, modifications
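For instance (path and hash are placeholders):

# Hypothetical usage of collect_data:
commits, modifications = collect_data(
    '/tmp/some-repo',
    ['9e71dd5726d775fb4a44a1839ee0d930e141f552'])  # placeholder hash
print(len(commits), len(modifications))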
Example #4
def run_worker(i: int, repos: List[Path]) -> None:
    idx = 0

    for repo_path in repos:
        commit_count = 0
        start_time = time.time()

        # noinspection PyBroadException
        try:
            repo_mining = RepositoryMining(
                str(repo_path), only_modifications_with_file_types=['.py'])

            Path(f'./result{i}/').mkdir(exist_ok=True)

            with open(f'./result{i}/{repo_path.name}.stats.csv',
                      'w',
                      newline='') as csv_file:

                writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)
                writer.writerow(COLUMNS)

                for commit in repo_mining.traverse_commits():
                    commit_count += 1

                    # crude filter: any file whose name contains "test"
                    test_file_modifications = [
                        m for m in commit.modifications if 'test' in m.filename
                    ]

                    if len(test_file_modifications) == 0:
                        continue

                    for modification in test_file_modifications:
                        writer.writerow((commit.project_name, commit.msg,
                                         modification.old_path,
                                         modification.new_path, commit.hash))

        except Exception:
            tb = traceback.format_exc()

            with open(f'error_log{i}.txt', 'a') as fp:
                fp.write(f'repo_path={repo_path}\n')
                fp.write(f'{tb}\n\n')

        finally:
            used_time = time.time() - start_time

            with open(f'progress_log{i}.txt', 'a') as fp:
                fp.write(f'idx={idx} (out of {len(repos)})\n')
                fp.write(f'repo_path={repo_path}\n')
                fp.write(f'commit_count={commit_count}\n')
                fp.write(f'used_time={used_time} seconds\n')
                fp.write(f'timestamp={datetime.datetime.now()}\n\n')

            idx += 1
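A sketch of how such a worker might be driven (hedged: the `./repos` directory and the four-process split are assumptions, not part of the example):

# Hypothetical driver: split repositories across a few worker processes
from multiprocessing import Process
from pathlib import Path

repos = sorted(Path('./repos').iterdir())
chunks = [repos[i::4] for i in range(4)]
workers = [Process(target=run_worker, args=(i, chunk))
           for i, chunk in enumerate(chunks)]
for w in workers:
    w.start()
for w in workers:
    w.join()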
Example #5
    def mine(self, **kwargs: Any) -> None:
        """Gather data from repository. To be extended in subclasses."""
        miner = RepositoryMining(self.repo, **kwargs)

        for commit in miner.traverse_commits():
            for m in commit.modifications:
                # graft commit-level metadata onto each modification so the
                # include()/update_stats() hooks only need the modification
                m.committer = commit.committer
                m.committer_date = commit.committer_date
                m.msg = commit.msg

                if self.include(m):
                    self.update_stats(m)
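A minimal subclass sketch: `LineStatsMiner`, the base-class name `Miner`, and the `stats` dict are assumptions for illustration; only `include` and `update_stats` are implied by the excerpt above.

class LineStatsMiner(Miner):  # `Miner` stands in for the class shown above
    def include(self, m) -> bool:
        # only count modifications to Python files
        return m.filename.endswith('.py')

    def update_stats(self, m) -> None:
        # tally added/removed lines per committer (self.stats is assumed)
        added, removed = self.stats.get(m.committer.name, (0, 0))
        self.stats[m.committer.name] = (added + m.added, removed + m.removed)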
Example #6
def parse_by_date(repo, bgn, end):
    lines = []

    miner = RepositoryMining(repo, since=bgn, to=end)
    for commit in miner.traverse_commits():
        for modification in commit.modifications:
            # diff_parsed['added'] is a list of (line_number, content) tuples
            added = modification.diff_parsed['added']
            for item in added:
                line = item[1]  # keep only the line content
                if should_ignore(line):  # should_ignore is defined elsewhere
                    continue
                lines.append(line)

    return lines
Example #7
def test_commits_pickling():
    repo_path = os.path.join(settings.get('git_repositories_dir'), 'trfl')
    repo = RepositoryMining(repo_path)

    commits = list(repo.traverse_commits())[:10]

    cnt_before = len(objgraph.by_type("Commit"))
    print(f'Starting with {cnt_before}')
    for n, commit in enumerate(commits):
        pickle.dumps(commit)
        print(f'#{n+1} {len(objgraph.by_type("Commit"))}')

    cnt_after = len(objgraph.by_type("Commit"))
    print(f'Ending with {cnt_after}')

    assert cnt_before == cnt_after  # Track issue on https://github.com/ishepard/pydriller/issues/102
Example #8
class GitCommitModel(GitModel):
    """
    A model for listing and manipulating git commits.
    """

    def __init__(self):
        super(GitCommitModel, self).__init__()
        self.repository_miner = RepositoryMining(self.dir_path)
        self.current_commit = None
        self.current_commit_file = None

    def list_commits(self):
        """
        Lists commits for the current branch for the commits table.
        :return: An array of arrays with commit information inside.
        """
        _commits_with_info = []
        for commit in reversed(list(self.repository_miner.traverse_commits())):
            _commit_info = [
                commit.hash[:12], commit.msg, commit.author.name,
                commit.author_date.strftime("%d/%m/%Y, %H:%M:%S")
            ]
            _commit_info_with_hash_as_key = [_commit_info, commit]
            _commits_with_info.append(_commit_info_with_hash_as_key)
        return _commits_with_info

    def list_files_in_current_commit(self):
        _file_map = []
        if self.current_commit is not None:
            for diff in self.current_commit.modifications:
                _file_map.append([diff.new_path, diff])
        return _file_map

    def current_file_diff(self):
        _diff = ""
        if self.current_commit_file is not None:
            _diff = self.current_commit_file.diff
        return _diff

    def checkout_commit(self):
        if self.current_commit is not None:
            # self.repo.git.stash()
            self.repo.git.checkout(self.current_commit.hash)
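A possible usage sketch for this model (hedged: the `GitModel` base class is assumed to supply `dir_path` and `repo`):

# Hypothetical usage of GitCommitModel:
model = GitCommitModel()
rows = model.list_commits()
for row, commit in rows:
    print(row)                         # [short hash, message, author, date]
model.current_commit = rows[0][1]      # newest commit (the list is reversed)
for path, diff in model.list_files_in_current_commit():
    print(path)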
Example #9
    def mining(
        self,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        from_commit: Optional[str] = None,
        to_commit: Optional[str] = None,
    ) -> None:
        repos = RepositoryMining(
            self.__repos_path,
            since=start_date,
            to=end_date,
            from_commit=from_commit,
            to_commit=to_commit,
        )
        for commit in repos.traverse_commits():
            if self.__recorder.is_record_before(commit.hash):
                continue
            for modification in commit.modifications:
                self.__handle_modification(modification, commit.hash)
            self.__recorder.record_git_commit(commit.hash, commit.author_date)
Example #10
    def watch_version(self):
        content = requests.get(
            'https://api.github.com/repos/Yokotes/MangaWatcher/contents/version'
        ).json()
        version_and_date = base64.b64decode(
            content['content']).decode('utf-8').split(', ')
        new_version = version_and_date[0]
        new_date = version_and_date[1]

        if self.current_version != new_version:
            repo = RepositoryMining('https://github.com/Yokotes/MangaWatcher',
                                    since=self.current_date)

            for commit in repo.traverse_commits():
                for mod in commit.modifications:
                    if ('manga.json' not in mod.filename
                            and 'manga_sites.json' not in mod.filename):
                        try:
                            with open('./' + mod.new_path,
                                      'w',
                                      encoding='utf-8') as file:
                                file.write(mod.source_code)
                        except Exception:
                            # e.g. mod.new_path is None for deleted files
                            return None
Example #11
class RepoMiner:
    """
    This class represents a repoMiner for analysing the historical changes in software repositories.
    """
    def __init__(self,
                 repoURL,
                 first=None,
                 second=None,
                 fromCommit=None,
                 since=None,
                 to=None):
        start = time.perf_counter()
        self.__gitRepo = GitRepository(repoURL)

        if first is not None and second is not None and since is None and to is None:
            self.repo = RepositoryMining(repoURL,
                                         from_commit=first,
                                         to_commit=second)
            self.__repo_type = RepoType.BETWEEN_COMMITS
        elif first is not None and second is None and since is None and to is None:
            self.repo = RepositoryMining(repoURL, single=first)
            self.__repo_type = RepoType.SINGLE_COMMIT
        elif first is None and second is None and since is not None and to is not None:
            try:
                date1 = parser.parse(since)
                date2 = parser.parse(to)
                self.repo = RepositoryMining(repoURL, since=date1, to=date2)
                self.__repo_type = RepoType.DATETIME
            except (ValueError, OverflowError):
                raise Exception("Entered datetime is not valid.")
        elif fromCommit is not None:
            self.repo = RepositoryMining(path_to_repo=repoURL,
                                         from_commit=fromCommit)
            self.__repo_type = RepoType.FROM_COMMIT
        else:
            self.repo = RepositoryMining(path_to_repo=repoURL)
            self.__repo_type = RepoType.ALL

        print("repoMiner was created")

        self.__files = []  # names of analyzed files
        self.__files_with_methods = []
        self.__test_files = []  # names of test files
        self.__production_files = []  # names of production files
        self.__commits = []  # List[str] of analysed commit hashes
        self.__commits_with_modified_methods = set()  # hashes of commits with modified methods
        self.__production_methods = []  # List[ModifiedMethod]
        self.__test_methods = []  # List[ModifiedMethod]
        self.__modified_methods = []  # List[ModifiedMethod]
        self.__moved_files_without_changes = []  # files moved without changes
        self.__analyzed_commits = []  # List[AnalyzedCommit]
        self.__matched_files = []  # list of matched files
        self.__not_matched_files = None  # instance of NotMatchedFiles
        self.__GetModifications()  # performs the analysis
        end = time.perf_counter()
        self.__analyse_time = "{:.2f}".format((end - start) / 60)  # analysis time in minutes

    @property
    def analyze_time(self):
        """Returns time for analysis."""
        return self.__analyse_time

    @property
    def repo_type(self) -> RepoType:
        """
        Returns the repo type.
        """
        return self.__repo_type

    @property
    def project_name(self) -> str:
        """
        Returns the project name.
        """
        return self.__gitRepo.project_name

    @property
    def modified_methods_count(self) -> int:
        """
        Returns the number of all methods including the test methods.
        """
        # count = self.__CalculateMethodsCount(self.__all_methods_data)
        return len(self.__modified_methods)

    @property
    def production_methods_count(self) -> int:
        """
        Returns the number of methods excluding the test methods.
        """
        # count = self.__CalculateMethodsCount(self.__methods_data)
        return len(self.__production_methods)

    @property
    def test_methods_count(self) -> int:
        """
        Returns the number of test methods.
        """
        # count = self.__CalculateMethodsCount(self.__test_methods_data)
        return len(self.__test_methods)

    @property
    def files(self) -> int:
        """
        Returns number of analysed files that contain modified methods.
        """
        return len(self.__files)

    @property
    def unique_files(self):
        return len(set(self.__files))

    @property
    def test_files(self) -> int:
        """
        Returns the number of test files.
        """
        return len(self.__test_files)

    @property
    def unique_test_files(self):
        return len(set(self.__test_files))

    @property
    def production_files(self) -> int:
        """Returns number of production files."""
        return len(self.__production_files)

    @property
    def unique_production_files(self):
        return len(set(self.__production_files))

    @property
    def files_with_methods(self):
        """Returns number of analyzed files that contain methods."""
        return len(self.__files_with_methods)

    @property
    def unique_files_with_methods(self):
        """Returns number of unique analyzed files that contain methods."""
        return len(set(self.__files_with_methods))

    @property
    def commits(self) -> int:
        """Returns the number of commits."""
        count = len(self.__commits)
        return count

    @property
    def commits_with_modified_methods(self) -> int:
        """Returns the number of commits with modified methods."""
        count = len(self.__commits_with_modified_methods)
        return count

    @property
    def moved_files(self):
        """Returns a list of moves files without changes."""
        return self.__moved_files_without_changes

    @property
    def modified_methods(self) -> List[ModifiedMethod]:
        """Returns a list of all methods including the test methods."""
        return self.__modified_methods

    @property
    def production_methods(self) -> List[ModifiedMethod]:
        """Returns a list of only methods."""
        return self.__production_methods

    @property
    def highest_code_churn_methods(self) -> List[ModifiedMethod]:
        """Returns a list of recommended methods."""
        recommended_methods = self.GetRecommendedMethods(
            self.production_methods, 10, True)
        return recommended_methods

    @property
    def highest_change_frequency_methods(self) -> List[ModifiedMethod]:
        """Returns a list of recommended methods."""
        recommended_methods = self.GetRecommendedMethods(
            self.production_methods, 10, False)
        return recommended_methods

    @property
    def test_methods(self) -> List[ModifiedMethod]:
        """Returns a list of only test methods."""
        return self.__test_methods

    @property
    def matched_files(self) -> List[MatchedFiles]:
        """Returns a list of matched files between file and test file."""
        return self.__matched_files

    @property
    def matched_files_count(self) -> int:
        """Returns the number of matched files."""
        return len(self.__matched_files)

    @property
    def not_matched_files(self) -> NotMatchedFiles:
        """Returns a not matched file object of the analyze."""
        return self.__not_matched_files

    @property
    def summarized_production_methods(self) -> List[SummarizedMethod]:
        """Summarizes all instances of the same production method. Returns summarized production methods."""
        return self.SummarizeMethods(self.production_methods)

    @property
    def summarized_test_methods(self) -> List[SummarizedMethod]:
        """Summarizes all instances of the same test method. Returns summarized test methods."""
        return self.SummarizeMethods(self.test_methods, True)

    @property
    def analyzed_commits(self):
        """Returns a list of analyzed commits"""
        return self.__analyzed_commits

    def __GetModifications(self):
        """Return modified methods. (commit hash, file name, methods)"""

        print("analyze commits")
        for commit in self.repo.traverse_commits():
            commit_hash = commit.hash
            analyzed_files = []
            self.__commits.append(commit.hash)
            for file in commit.modifications:
                if JAVA_FILE_SUFFIX in file.filename:
                    print(commit.hash, file.filename)
                    self.__files.append(file.filename)
                    if file.source_code is None and file.source_code_before is None:
                        self.__moved_files_without_changes.append(
                            (commit.hash, file))
                    elif self.IsTestFile(file):
                        self.__test_files.append(file.filename)
                        test_methods = self.__AnalyseFile(
                            file, True, commit_hash)
                        analyzed_files.append(
                            AnalyzedFile(commit.hash, file, True,
                                         test_methods))
                        for method in test_methods:
                            self.__test_methods.append(method)
                            self.__modified_methods.append(method)
                    else:
                        self.__production_files.append(file.filename)
                        methods = self.__AnalyseFile(file, False, commit_hash)
                        analyzed_files.append(
                            AnalyzedFile(commit.hash, file, False, methods))
                        for method in methods:
                            self.__production_methods.append(method)
                            self.__modified_methods.append(method)

            self.__analyzed_commits.append(
                AnalyzedCommit(commit, analyzed_files))

        self.__CalculateFrequencyOfChanges(self.__modified_methods)
        if self.__repo_type == RepoType.BETWEEN_COMMITS or self.__repo_type == RepoType.SINGLE_COMMIT:
            self.__GetMatchedFiles()
        # self.__GetMultipleTimesRenamedMethods()

    @staticmethod
    def IsTestFile(file) -> bool:
        files = re.findall(r'tests?\.java', file.filename.lower())
        imports = re.findall(r'org\.junit.*', file.source_code.lower()
                             ) if file.source_code is not None else False
        path = re.findall(
            r'src\\test',  # note: matches Windows-style path separators
            file.new_path.lower()) if file.new_path is not None else False

        after = any(annotation in file.source_code for annotation in java_test_annotations) if \
            file.source_code is not None else False
        before = any(annotation in file.source_code_before for annotation in java_test_annotations) if \
            file.source_code_before is not None else False

        if file.source_code is not None and file.source_code_before is not None:
            if files or imports or path or before or after:
                return True
        elif file.source_code is not None and file.source_code_before is None:
            if files or imports or path or after:
                return True
        elif file.source_code is None and file.source_code_before is not None:
            if files or imports or path or before:
                return True

        return False

    def __AnalyseFile(self, file, isTestFile,
                      commit_hash) -> List[ModifiedMethod]:
        """Analyzes a file and returns a list of modified methods."""

        deleted_lines, added_lines = self.GetLinesFromDiff(file)
        lines = [added_lines, deleted_lines]

        sourceCodeAfter = self.StoreSourceCodeAsLines(file.source_code)
        sourceCodeBefore = self.StoreSourceCodeAsLines(file.source_code_before)

        methodsAfter = self.GetMethods(file.methods, sourceCodeAfter,
                                       isTestFile, commit_hash)
        methodsBefore = self.GetMethods(file.methods_before, sourceCodeBefore,
                                        isTestFile, commit_hash)

        if file.change_type.name == "ADD":
            methods = self.GetAddedOrDeletedMethods(
                methodsAfter, ModificationType.NEWLY_ADDED, lines)

        elif file.change_type.name == "DELETE":
            methods = self.GetAddedOrDeletedMethods(
                methodsBefore, ModificationType.COMPLETELY_DELETED, lines)

        elif file.change_type.name == "RENAME":
            methods = self.GetModifiedMethods(methodsBefore, methodsAfter,
                                              lines, file.change_type.name)

        elif file.change_type.name == "MODIFY":
            methods = self.GetModifiedMethods(methodsBefore, methodsAfter,
                                              lines)

        else:
            methods = []

        if len(methods) > 0:
            self.__files_with_methods.append(file.filename)
            self.__commits_with_modified_methods.add(commit_hash)

        return methods

    @staticmethod
    def GetModifiedMethods(methodsBefore, methodsAfter, lines, fileType=None):
        """Return list of modified methods"""

        modifiedMethodsBefore = []
        modifiedMethodsAfter = []
        modifiedMethods = []

        # mapping deleted lines and methodsBefore and adding modification type
        for method in methodsBefore:
            modificationType = RepoMiner.GetTypeOfMethods(method,
                                                          lines[1],
                                                          after=False)
            modifiedMethodsBefore.append((method, modificationType))

        # mapping added lines and methodsAfter and adding modification type
        for method in methodsAfter:
            modificationType = RepoMiner.GetTypeOfMethods(method,
                                                          lines[0],
                                                          after=True)
            modifiedMethodsAfter.append((method, modificationType))

        # get renamed methods in modifiedMethodsBefore
        methodsBeforeRenamed = [
            x for x in modifiedMethodsBefore
            if x[1] == ModificationType.RENAMED
        ]

        # get renamed methods in modifiedMethodsAfter
        methodsAfterRenamed = [
            x for x in modifiedMethodsAfter if x[1] == ModificationType.RENAMED
        ]

        # get mapped renamed methods of methodsBeforeRenamed and methodsAfterRenamed
        renamedMethods = RepoMiner.GetRenamedMethods(methodsBeforeRenamed,
                                                     methodsAfterRenamed,
                                                     lines)

        # get added methods (only additions)
        newlyAddedMethods = [
            method for method in modifiedMethodsAfter
            if method[1] == ModificationType.NEWLY_ADDED
        ]

        # get deleted methods (only deletions)
        completelyDeletedMethods = [
            method for method in modifiedMethodsBefore
            if method[1] == ModificationType.COMPLETELY_DELETED
        ]

        # remove method that are already considered
        modifiedMethodsBefore = RepoMiner.RemoveMethods(
            modifiedMethodsBefore, completelyDeletedMethods)
        modifiedMethodsBefore = RepoMiner.RemoveMethods(
            modifiedMethodsBefore, methodsBeforeRenamed)
        modifiedMethodsAfter = RepoMiner.RemoveMethods(modifiedMethodsAfter,
                                                       newlyAddedMethods)
        modifiedMethodsAfter = RepoMiner.RemoveMethods(modifiedMethodsAfter,
                                                       methodsAfterRenamed)

        # mapping methodsBefore and methodsAfter
        for methodBefore in modifiedMethodsBefore:
            try:
                if fileType:
                    fileNameBefore = methodBefore[0].long_name.split('::')[0]
                    # str.lstrip strips a character set, not a prefix, so slice
                    # off the file-name prefix instead
                    methodNameBefore = methodBefore[0].long_name[
                        len(fileNameBefore):].lstrip(':')
                    match = next(x for x in modifiedMethodsAfter
                                 if methodNameBefore in x[0].long_name)
                else:
                    match = next(
                        x for x in modifiedMethodsAfter
                        if methodBefore[0].long_name == x[0].long_name)

                if match[1] == ModificationType.ADDED and methodBefore[
                        1] == ModificationType.DELETED:
                    modifiedMethods.append(
                        ModifiedMethod(
                            methodBefore=methodBefore[0],
                            methodAfter=match[0],
                            modificationType=ModificationType.MODIFIED.name,
                            lines=lines))

                elif match[1] == ModificationType.UNKNOWN and methodBefore[
                        1] == ModificationType.DELETED:
                    modifiedMethods.append(
                        ModifiedMethod(
                            methodBefore=methodBefore[0],
                            methodAfter=match[0],
                            modificationType=ModificationType.DELETED.name,
                            lines=lines))

                elif match[1] == ModificationType.ADDED and methodBefore[
                        1] == ModificationType.UNKNOWN:
                    modifiedMethods.append(
                        ModifiedMethod(
                            methodBefore=methodBefore[0],
                            methodAfter=match[0],
                            modificationType=ModificationType.ADDED.name,
                            lines=lines))

            except StopIteration:
                pass

        # append newly added methods
        for method in newlyAddedMethods:
            modifiedMethods.append(
                ModifiedMethod(methodAfter=method[0],
                               modificationType=method[1].name,
                               lines=lines))

        # append completely deleted methods
        for method in completelyDeletedMethods:
            modifiedMethods.append(
                ModifiedMethod(methodBefore=method[0],
                               modificationType=method[1].name,
                               lines=lines))

        # append renamed methods
        for method in renamedMethods:
            modifiedMethods.append(method)

        return modifiedMethods

    @staticmethod
    def RemoveMethods(methods, methodsToRemove):
        """Removes certain methods from a list and the remaining list of methods."""

        for method in methodsToRemove:
            methods.remove(method)

        return methods

    @staticmethod
    def GetLinesFromDiff(file):
        """Parses diff and return deleted and added lines."""

        parsed_diff: Any = file.diff_parsed
        return parsed_diff['deleted'], parsed_diff['added']

    @staticmethod
    def StoreSourceCodeAsLines(sourceCode) -> Optional[list]:
        """Stores source code as lines and returns list of source code lines."""
        if sourceCode is not None:
            sourceCodeLines = [line for line in sourceCode.split("\n")]
        else:
            sourceCodeLines = None
        return sourceCodeLines

    @staticmethod
    def GetTypeOfMethods(method, lines, after=True) -> ModificationType:
        """Returns type of methods depending on after value."""

        codeLines = method.code_lines
        if all(codeLine in lines for codeLine in codeLines):
            return ModificationType.NEWLY_ADDED if after else ModificationType.COMPLETELY_DELETED
        elif any(codeLine in lines for codeLine in codeLines):
            if codeLines[0] not in lines:
                return ModificationType.ADDED if after else ModificationType.DELETED
            else:  # the method's first (signature) line itself changed
                return ModificationType.RENAMED
        else:
            return ModificationType.UNKNOWN

    @staticmethod
    def GetMethods(methods, sourceCode, isTestFile,
                   commit_hash) -> List[Method]:
        """Returns list of methods."""

        listOfMethods = []
        sourceCodeLines = ''

        for method in methods:
            start = method.start_line - 1

            if isTestFile:
                if any(annotation in sourceCode[method.start_line - 2]
                       for annotation in java_test_annotations):
                    start = method.start_line - 2

            for i in range(start, method.end_line):
                sourceCodeLines += sourceCode[i] + "\n"
            listOfMethods.append(Method(method, sourceCodeLines, commit_hash))
            sourceCodeLines = ''

        return listOfMethods

    @staticmethod
    def GetAddedOrDeletedMethods(methods, modificationType,
                                 lines) -> List[ModifiedMethod]:
        """Return newly added or completely deleted methods depending on the entered methods and type."""
        if modificationType.name == "COMPLETELY_DELETED":
            addedOrDeletedMethods = [
                ModifiedMethod(methodBefore=method,
                               modificationType=modificationType.name,
                               lines=lines) for method in methods
            ]
        else:
            addedOrDeletedMethods = [
                ModifiedMethod(methodAfter=method,
                               modificationType=modificationType.name,
                               lines=lines) for method in methods
            ]
        return addedOrDeletedMethods

    @staticmethod
    def GetRenamedMethods(methodsBefore,
                          methodsAfter,
                          lines,
                          similarity=0.8) -> List[ModifiedMethod]:
        """Calculates the levenshtein distance between methods and returns a list of renamed methods."""

        MIN_SIMILARITY_SIGNATURE = similarity
        modifiedMethods = []
        methods = set()
        renamedMethodPairs = []
        notRenamedMethodPairs = []

        # consider special case:
        if len(methodsBefore) != len(methodsAfter):
            if len(methodsBefore) == 0 and len(methodsAfter) == 1:
                modifiedMethods.append(
                    ModifiedMethod(
                        methodAfter=methodsAfter[0][0],
                        modificationType=ModificationType.ADDED.name,
                        lines=lines))
                return modifiedMethods
            elif len(methodsBefore) == 1 and len(methodsAfter) == 0:
                modifiedMethods.append(
                    ModifiedMethod(
                        methodBefore=methodsBefore[0][0],
                        modificationType=ModificationType.DELETED.name,
                        lines=lines))
                return modifiedMethods

        for methodAfter in methodsAfter:
            for methodBefore in methodsBefore:
                ratio_signature = Levenshtein.ratio(methodBefore[0].signature,
                                                    methodAfter[0].signature)
                ratio_method_body = Levenshtein.ratio(
                    methodBefore[0].method_body, methodAfter[0].method_body)
                current_object = (methodBefore[0], methodAfter[0],
                                  ratio_signature, ratio_method_body)

                if ratio_signature >= 0.9:
                    MIN_SIMILARITY_METHOD_BODY = similarity
                else:
                    MIN_SIMILARITY_METHOD_BODY = (
                        (1 - similarity) / 2) + similarity

                if len(methodsBefore) == 1 and len(methodsAfter) == 1:
                    renamedMethodPairs.append(current_object)
                elif ratio_signature >= MIN_SIMILARITY_SIGNATURE and ratio_method_body >= MIN_SIMILARITY_METHOD_BODY:
                    if methodBefore[0] in [x[0] for x in renamedMethodPairs]:
                        match = [
                            x for x in renamedMethodPairs
                            if methodBefore[0] == x[0]
                        ]
                        if current_object[2] > match[0][2]:
                            index = renamedMethodPairs.index(match[0])
                            renamedMethodPairs[index] = current_object
                    else:
                        renamedMethodPairs.append(current_object)
                else:
                    notRenamedMethodPairs.append(current_object)

        for method in renamedMethodPairs:
            if method[0] not in methods and method[1] not in methods:
                modifiedMethods.append(
                    ModifiedMethod(
                        methodBefore=method[0],
                        methodAfter=method[1],
                        modificationType=ModificationType.RENAMED.name,
                        ratio=(method[2], method[3]),
                        lines=lines))
                methods.add(method[0])
                methods.add(method[1])

        for method in notRenamedMethodPairs:
            if method[0] not in methods:
                modifiedMethods.append(
                    ModifiedMethod(
                        methodBefore=method[0],
                        modificationType=ModificationType.DELETED.name,
                        lines=lines))
                methods.add(method[0])

            if method[1] not in methods:
                modifiedMethods.append(
                    ModifiedMethod(
                        methodAfter=method[1],
                        modificationType=ModificationType.ADDED.name,
                        lines=lines))
                methods.add(method[1])

        return modifiedMethods

    @staticmethod
    def __CalculateFrequencyOfChanges(methods):
        """Calculate the change frequency for each method."""

        print("calculate change frequency")
        for x in methods:
            x.change_frequency = sum(
                x.long_name == y.long_name and x.file_name == y.file_name
                for y in methods)

    @staticmethod
    def SummarizeMethods(methods, isTestfile=None) -> List[SummarizedMethod]:
        """Summarizes the same method instances."""

        summarized_methods = []
        method_names = set(x.long_name for x in methods)

        for method_name in method_names:
            method = [x for x in methods if x.long_name == method_name]

            if isTestfile:
                summarized_methods.append(
                    SummarizedMethod(method, isTestMethod=True))
            else:
                summarized_methods.append(
                    SummarizedMethod(method, isTestMethod=False))

        return summarized_methods

    @staticmethod
    def __SortMethodsByChangedFrequency(methods) -> List[ModifiedMethod]:
        """Returns sorted list of methods by change frequency."""

        methods.sort(key=lambda x: x.change_frequency, reverse=True)

        return methods

    @staticmethod
    def GetRecommendedMethods(methods, number, code_churn):
        """Returns a list of recommended methods."""

        recommended_methods = []
        filteredMethod = None

        if number > len(methods):
            print("number is bigger then identified methods")
            number = len(methods)

        for i in range(0, number):
            max_value = 0
            for method in methods:
                value = method.code_churn if code_churn else method.change_frequency
                if value > max_value:
                    max_value = value
                    filteredMethod = method

            methods = [
                x for x in methods if x.long_name != filteredMethod.long_name
            ]
            recommended_methods.append(filteredMethod)

        return recommended_methods

    def __GetMatchedFiles(self):
        """Determines corresponding files and test files. Returns matched files."""

        print("calculate matched files")
        methods = self.production_methods
        test_methods = self.test_methods

        summarized_methods = self.SummarizeMethods(methods)
        summarized_test_method = self.SummarizeMethods(test_methods, True)

        methods_file_names = set(method.file_name for method in methods)
        test_methods_file_names = set(method.file_name
                                      for method in test_methods)

        matched, not_matched = self.GetMatchedFileNames(
            methods_file_names, test_methods_file_names)

        for match in matched:
            sublist_methods = []
            sublist_test_methods = []
            for method in summarized_methods:
                if match[0] == method.summarized_method.file_name:
                    sublist_methods.append(method.summarized_method)

            for method in summarized_test_method:
                if match[1] == method.summarized_test_method.file_name:
                    sublist_test_methods.append(method.summarized_test_method)

            self.__matched_files.append(
                MatchedFiles(match[0], match[1], sublist_methods,
                             sublist_test_methods))

        not_matched_methods = [
            x for x in methods if x.file_name in not_matched[0]
        ]
        not_matched_test_methods = [
            x for x in test_methods if x.file_name in not_matched[1]
        ]

        self.__not_matched_files = NotMatchedFiles(not_matched[0],
                                                   not_matched[1],
                                                   not_matched_methods,
                                                   not_matched_test_methods)

    @staticmethod
    def GetMatchedFileNames(production_file_names, test_file_names):
        """Find pairs of corresponding file and test file names. Returns identified pairs."""

        matched = []

        for file_name in production_file_names:
            name = re.sub('<.*>', '', file_name)
            base = re.escape(name.lower().split(".")[0])  # escape regex metacharacters
            regEx1 = base + r"test\.java"
            regEx2 = base + r"testcase\.java"
            regEx3 = base + r"tests\.java"

            for test_file_name in test_file_names:
                match1 = re.fullmatch(regEx1, test_file_name.lower())
                match2 = re.fullmatch(regEx2, test_file_name.lower())
                match3 = re.fullmatch(regEx3, test_file_name.lower())
                if match1 or match2 or match3:
                    matched.append((file_name, test_file_name))

        matched_files = [y[0] for y in matched]
        matched_test_files = [y[1] for y in matched]

        not_match_files = [
            x for x in production_file_names if x not in matched_files
        ]
        not_match_test_files = [
            x for x in test_file_names if x not in matched_test_files
        ]

        not_matched = [not_match_files, not_match_test_files]

        print("len matched:", len(matched))

        return matched, not_matched

    # TODO: still needs to be tested
    def __GetMultipleTimesRenamedMethods(self):
        """Searches for each method multiple corresponding renamed methods."""

        all_methods = self.modified_methods
        all_renamed_methods = [x for x in all_methods if x.type == "RENAMED"]

        for x in all_renamed_methods:
            found_a_renamed_method = True
            renamed_method = x
            while found_a_renamed_method:
                found_a_renamed_method, renamed_method = self.__FindRenamedMethods(
                    renamed_method, all_renamed_methods)
                if renamed_method is not None:
                    x.renamed_methods.append(renamed_method)
                    x.multiple_renamed = True

    @staticmethod
    def __FindRenamedMethods(method, methods):
        renamed_method = None
        for x in methods:
            if method.signature == x.old_signature and x.ratio_signature < 1.0:
                renamed_method = x
                break

        if renamed_method is not None:
            return True, renamed_method
        else:
            return False, None
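A possible usage sketch for `RepoMiner` (the URL and commit hashes are placeholders):

# Hypothetical usage: analyse the range between two commits
miner = RepoMiner('https://github.com/owner/project',
                  first='a1b2c3d',    # placeholder start commit hash
                  second='d4e5f6a')   # placeholder end commit hash
print(miner.repo_type, miner.commits, miner.modified_methods_count)
for method in miner.highest_code_churn_methods:
    print(method.long_name, method.code_churn)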
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("from_tag")
    parser.add_argument("to_tag")
    args = parser.parse_args()

    pr_commits = []
    dependency_updates = []
    migrations = []
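
    # Note: get_migration_desc, get_pr_numbers, is_dependency_update,
    # get_pr_title and PR_URL_TMPL are assumed to be defined elsewhere
    # in this script.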

    mine = RepositoryMining(".", from_tag=args.from_tag, to_tag=args.to_tag)
    for commit in mine.traverse_commits():
        for mod in commit.modifications:
            if mod.change_type == ModificationType.ADD and "/migrations/" in mod.new_path:
                migrations.append(get_migration_desc(mod))

        if not commit.merge:
            # Not a PR merge
            continue
        commit.prs = get_pr_numbers(commit)
        if not commit.prs:
            continue

        if is_dependency_update(commit):
            dependency_updates.append(commit)
        else:
            pr_commits.append(commit)

    # output

    print(f"# Version {args.to_tag}")
    print()
    print(f"## PRs merged since {args.from_tag}")
    print()

    if not pr_commits:
        print("None")

    for commit in pr_commits:
        for pr in commit.prs:
            print(
                f"* [PR {pr}]({PR_URL_TMPL.format(pr)}): {get_pr_title(commit, pr)}"
            )

    print()
    print("## Dependency updates")
    print()

    if not dependency_updates:
        print("None")

    for commit in dependency_updates:
        for pr in commit.prs:
            print(
                f"* [PR {pr}]({PR_URL_TMPL.format(pr)}): {get_pr_title(commit, pr)}"
            )

    print()
    print("## Migrations")
    print()

    if not migrations:
        print("None")

    for migration in migrations:
        print(f"* {migration['path']} - {migration['description']}")
Example #13
    value += 1
    value %= 13
    return 1 if value == 0 else value


for repo in repos:
    if repo != "generator-jhipster":
        continue
    repository_folder = "{}{}/".format(repo_folder, repo)
    print(repository_folder)
    repository = RepositoryMining(repository_folder,
                                  only_modifications_with_file_types=['.java'])

    selected_commits = dict()

    commits = repository.traverse_commits()

    with open("../out/{}.csv".format(repo.replace(".git", "")), 'w') as output:

        conditional_tag = re.compile(r'\+@Conditional.*\(')
        profile_tag = re.compile(r'\+@Profile.*\(')
        conditional_remove_tag = re.compile(r'-@Conditional.*\(')
        profile_remove_tag = re.compile(r'-@Profile.*\(')
        conds = dict()

        repo = RepositoryMining(repo)

        print("DATE, +@Conditional, +@Profile, -@Conditional, -@Profile")
        print("DATE, +@Conditional, +@Profile, -@Conditional, -@Profile",
              file=output)
        for commit in commits:
Example #14
single_java_vuln_commits = []
for repo in repos:
    repo_url = repo["url"]
    clean_local_repo(repo_url)
    print(f'Cloning {repo_url} ...', end=' ')
    try:
        repo_mining = RepositoryMining(repo_url,
                                       clone_repo_to="/tmp",
                                       order='reverse',
                                       only_in_branch='master')
    except GitCommandError as e:
        print('cannot mine. Skipping.')
        logging.error(e)
    else:
        try:
            commits = list(repo_mining.traverse_commits())
            print("done!")
            vuln_commits = [c for c in commits if c.hash in repo["hashes"]]
            if vuln_commits:
                for vc in vuln_commits:
                    if is_single_java(vc):
                        head_msg = vc.msg.split("\n", 1)[0]  # first line of the message (unused below)
                        single_java_vuln_commits.append({
                            "project": repo_url,
                            "hash": vc.hash
                        })
                        print(f'\t{vc.hash} is interesting!')
                    else:
                        print(f'\t{vc.hash} found but not interesting.')
            else:
                print('\tcommit(s) not found :(')
Example #15
class ExtractSourceFilesInfo:

    # repository_path = name of the repository (same as on GitHub and in the local repo)
    # path_to_file = path to the sources, used to avoid analysing test files
    def __init__(self, repository_path, path_to_file):
        self._repository = RepositoryMining("https://www.github.com/" + repository_path + ".git")
        self._repository_path = repository_path
        self._path_to_file = path_to_file
        self._classNames = []

    # This function creates the file-developers dictionary
    def getFileDevDictionary(self):
        # dictionary instance
        commitDict = dict()

        # Iterating the commits...
        for commit in self._repository.traverse_commits():
            # N.B. Each commit may contain more than one modification, because a developer
            # may modify more than one file and thus commit several modified files.
            # Iterating the modifications in the commit...
            for m in commit.modifications:

                # if the filename of the modification 'm' isn't already in the dictionary, let's add it as key of
                # commitDict: the corresponding value is another dictionary!
                # commitDict = {'filename': {} }
                if m.filename not in commitDict:
                    commitDict[m.filename] = dict()

                # if the author modifies the file 'filename' for the FIRST TIME, put the
                # author name as a key of the internal dictionary
                # (which, in turn, is the value for the corresponding filename in commitDict)
                # with '1' as value: this counter tracks how many times the author modified the file
                if commit.author.name not in commitDict[m.filename]:
                    commitDict[m.filename][commit.author.name] = 1

                # if the author modifies the file 'filename' a SECOND TIME (or more),
                # increase the corresponding counter
                else:
                    commitDict[m.filename][commit.author.name] += 1
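
        # Illustration (hypothetical data): commitDict now maps each file
        # name to a per-author modification counter, e.g.
        # {'Foo.java': {'Alice': 3, 'Bob': 1}, 'Bar.java': {'Alice': 2}}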

        # Create the graph
        y = nx.Graph()

        file_name_list = []
        committer_list = []
        for x, committers in commitDict.items():
            file_name_list.append(x)
            for committer, num_commit in committers.items():
                if committer not in committer_list:
                    committer_list.append(committer)

        # Add edges to the graph
        y.add_nodes_from(file_name_list, bipartite=0)
        y.add_nodes_from(committer_list, bipartite=1)

        list_to_add = []
        for filename, committers in commitDict.items():
            for committer, num_commit in committers.items():
                list_to_add.append((filename, committer))
        y.add_edges_from(list_to_add)
        # Create the figure before drawing, so the graph is rendered on it
        # (previously a new, blank figure was created after drawing)
        plt.figure(figsize=(10, 8), dpi=300)
        pos = nx.spring_layout(y, k=0.4, iterations=20)
        nx.draw_networkx_nodes(y, pos, node_size=40)
        nx.draw_networkx_edges(y, pos, edgelist=y.edges, edge_color="b", style="solid")
        nx.draw_networkx_labels(y, pos, font_size=7, font_family="sans-serif")

        # Show the graph
        plt.axis("off")
        plt.show()

        return commitDict

    # This function creates the file-file developers dictionary
    def getFileFileDictionary(self):
        repo_dir = self._repository_path + "/" + self._path_to_file
        subprocess.call(
            ['java', '-jar', 'depends/depends.jar', 'java', repo_dir, 'outputDep', '--auto-include',
             '-d=depends'])

    def getFileFileMatrix(self):
        self.getFileFileDictionary()
        with open("depends/outputDep.json") as f:
            data = json.load(f)

        # Get class names of the entire project
        name_of_classes = list()
        for key in data['variables']:
            filename = pathlib.PureWindowsPath(key)

            # Convert path to the right format for the current operating system
            path = pathlib.PurePath(filename)
            name_of_classes.append(path.name)

        self._classNames = name_of_classes

        dependencies = list()
        dependenciesRow = list()

        # Iterating all the pairs of classes that have dependencies: index goes from 0 to n (#number of classes)
        for i in range(0, len(data["variables"])):
            # Iterating all classes (from 0 to n)
            for j in range(0, len(data["variables"])):
                # Boolean variable that tells us whether any dependencies are found
                noDependencies = True
                # Iterating the dependencies found by "Depends":
                for index in range(0, len(data["cells"])):
                    # If there are dependencies from the class indexed as 'i'...
                    if (data["cells"][index]["src"] == i):
                        # ...to the class indexed as 'j'
                        if (data["cells"][index]["dest"] == j):
                            # DEPENDENCY FOUND! Put the boolean = False and compute the sum of the dependencies!
                            noDependencies = False
                            dependenciesRow.append(sum(data["cells"][index]["values"].values()))
                # No dependencies between the class 'i' and the class 'j': put 0 in the list
                if (noDependencies):
                    dependenciesRow.append(0)

            # We are going to the next row, which means 'i' is about to change (another class is
            # going to be analyzed): copy the 'dependenciesRow' list into a support list so the
            # results land in the 'dependencies' matrix, and re-use 'dependenciesRow' next iteration
            supportList = deepcopy(dependenciesRow)  # copy
            del dependenciesRow[:]  # empty the list
            dependencies.extend([supportList])  # dependencies matrix filling

        k = 0
        dict_to_return = dict()
        for class_name in name_of_classes:
            j = 0
            dict_to_return[class_name] = dict()
            for class_name_2 in name_of_classes:
                if dependencies[k][j] > 0:
                    dict_to_return[class_name][class_name_2] = dependencies[k][j]
                j = j + 1
            k = k + 1

        # Create the graph
        y = nx.Graph()
        for file, file_dep in dict_to_return.items():
            for file2, val in file_dep.items():
                y.add_edge(file, file2, weight=val)

        # Add the edges to the graph
        pos = nx.spring_layout(y)
        nx.draw_networkx_nodes(y, pos, node_size=70)
        nx.draw_networkx_edges(y, pos, edgelist=y.edges, edge_color="b", style="solid")
        nx.draw_networkx_labels(y, pos, font_size=5, font_family="sans-serif")

        # Print the graph
        plt.axis("off")
        plt.show()

        return dependencies, name_of_classes

    def getFileDevMatrix(self):
        # Getting data
        data = self.getFileDevDictionary()

        # Get all file names
        fileNames = list(data.keys())
        devNames = []

        # Get all developers' names
        # (note: self._classNames is populated by getFileFileMatrix(), which must run first)
        for file in self._classNames:
            for key in data[file].keys():
                if key not in devNames:
                    # print(key)
                    devNames.append(key)

        # File dev matrix
        fileDevMatrix = list()

        # A list, used for each row of the matrix: at each iteration is used and then empty, in order
        # to re-use it in the next iteration
        fileDevRow = []

        # Iterating file names
        for i in range(0, len(self._classNames)):
            # Iterating developers names
            for j in range(0, len(devNames)):
                # If a developer name is in the dictionary associated with a certain file
                # (meaning they made at least 1 commit on that file)...
                if (devNames[j] in data[self._classNames[i]]):
                    # append the number of commits on that file
                    fileDevRow.append(data[self._classNames[i]][devNames[j]])
                else:  # otherwise put 0
                    fileDevRow.append(0)

            # We are going to the next row, which means 'i' is about to change (another file is
            # going to be analyzed): copy the 'fileDevRow' list into a support list so the results
            # land in the matrix, and re-use 'fileDevRow' in the next iteration
            supportList = deepcopy(fileDevRow)  # copy
            del fileDevRow[:]  # empty the list
            fileDevMatrix.extend([supportList])  # matrix filling

        return fileDevMatrix, devNames
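A possible usage sketch (the repository slug and path are placeholders; note that getFileFileMatrix() must run before getFileDevMatrix(), since the latter reads self._classNames, and that the Depends jar must be available locally):

# Hypothetical usage of ExtractSourceFilesInfo:
info = ExtractSourceFilesInfo('owner/project', 'src/main/java')
dependencies, class_names = info.getFileFileMatrix()  # also populates _classNames
file_dev_matrix, dev_names = info.getFileDevMatrix()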
Example #16
class ArgumentCommits():
    def __init__(self, urls=None):
        # avoid a mutable default argument
        self.mining_object = RepositoryMining(
            urls if urls is not None else [],
            only_in_main_branch=True,
            only_modifications_with_file_types=['.java'])
        self.commit_records = defaultdict(list)
        self.result_for_csv = ''
        self.result_for_tsv = ''

    def fetch_commit_data(self):
        '''
        This routine fetches the commit data of the specified repository
        and runs static analysis on the code files in the commit.
        '''
        for commit in self.mining_object.traverse_commits():
            for mod in commit.modifications:
                if mod.filename[-5:] == '.java':  # get only Java files from the commit
                    commit_code_details = lizard.analyze_file.analyze_source_code(
                        mod.filename, mod.source_code)

                    for func in commit_code_details.function_list:
                        d = func.__dict__
                        if 'for(' in d["long_name"]:
                            # handle the case where 'for()' is considered a function by lizard
                            continue

                        self.commit_records[f'{mod.filename}~{d["name"]}'].append({
                            'hash': commit.hash,
                            # doc == date of commit
                            'doc': commit.author_date.strftime("%Y-%m-%d %H:%M:%S"),
                            'current_signature': d["long_name"],
                            'args': d["parameters"]
                        })

    def find_commits_with_additional_parameters(self):
        '''
        This routine finds the commits where one or more arguments were added
        to a function.
        '''
        print("Fetching commits and running static analysis: In Progress...")
        self.fetch_commit_data()
        print("Fetching commits and running static analysis: Done!")
        print("Finding Commits with Additional Parameters: In Progress...")
        for key, value in self.commit_records.items():
            # sort the commits for this function by date of commit
            value.sort(key=lambda el: parse(el["doc"]))
            file_name = key.split('~')[0]
            n = len(value)
            current_signature = value[0]['current_signature']

            for i in range(0, n - 1):
                cur, nxt = value[i], value[i + 1]
                if (len(nxt['args']) > len(cur['args'])
                        and nxt["current_signature"] != current_signature
                        and nxt["hash"] != cur["hash"]):
                    self.result_for_csv += f'{nxt["hash"]},{file_name},{cur["current_signature"]},{nxt["current_signature"]}\n'
                    self.result_for_tsv += f'{nxt["hash"]}\t{file_name}\t{cur["current_signature"]}\t{nxt["current_signature"]}\n'
                    current_signature = nxt["current_signature"]

        print("Finding Commits with Additional Parameters: Done!")

    def write_to_csv(self, filename="default_file_name"):
        with open(f'{filename}.csv', 'w') as file:
            file.write('Commit SHA,Java File,Old function signature,New function signature\n')
            file.write(self.result_for_csv)  # the result is one pre-built string

    def write_to_tsv(self, filename="default_file_name"):
        with open(f'{filename}.tsv', 'w') as file:
            file.write('Commit SHA\tJava File\tOld function signature\tNew function signature\n')
            file.write(self.result_for_tsv)
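
A minimal usage sketch, assuming the imports the class relies on (pydriller's
RepositoryMining, lizard, collections.defaultdict, dateutil.parser.parse) are
in place; the repository URL and output name below are placeholders, not part
of the original:

if __name__ == '__main__':
    # hypothetical invocation; any public Java repository URL works here
    miner = ArgumentCommits(urls=['https://github.com/apache/commons-lang'])
    miner.find_commits_with_additional_parameters()
    miner.write_to_csv('added_parameters')  # writes added_parameters.csv
    miner.write_to_tsv('added_parameters')  # writes added_parameters.tsv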
Example #17
"""
local directory containing Java project repositories
"""

home = str(Path.home())
# @TODO add this to configuration
# Path containing GitHub repositories of Java projects
dataset_path = os.path.join(home, "ml4se_dataset", "unique_mining")
project_names = os.listdir(dataset_path)

print("Starting to look through projects")
for project_name in project_names:
    path = os.path.join(dataset_path, project_name)
    repository_mining = RepositoryMining(path)
    gr = GitRepository(path)  # create once per project instead of once per commit
    # print("Starting to analyze commits for {}".format(path))
    try:
        for commit in repository_mining.traverse_commits():
            for modified_file in commit.modifications:
                if modified_file.filename.endswith(".java"):
                    diff = modified_file.diff
                    parsed_diff = gr.parse_diff(diff)

                    lines_containing_less_or_equal = []
                    lines_containing_less = []
                    lines_containing_greater_or_equal = []
                    lines_containing_greater = []
                    for deletion in parsed_diff['deleted']:
                        line_nr = deletion[0]
                        content = deletion[1]
                        if " <= " in content:
                            lines_containing_less_or_equal.append(line_nr)
                        elif " < " in content:
                            lines_containing_less.append(line_nr)
                        elif " >= " in content:
                            lines_containing_greater_or_equal.append(line_nr)
                        elif " > " in content:
                            lines_containing_greater.append(line_nr)
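
For reference, PyDriller's GitRepository.parse_diff returns a dict whose
'added' and 'deleted' entries are lists of (line_number, content) tuples,
which is what the loop above unpacks. A minimal standalone sketch, with a
placeholder path and diff text:

from pydriller import GitRepository

gr = GitRepository('/path/to/repo')  # placeholder path
diff_text = '@@ -1,2 +1,2 @@\n-if (a <= b) {\n+if (a < b) {'
parsed = gr.parse_diff(diff_text)
for line_nr, content in parsed['deleted']:
    print(line_nr, content)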
Example #18
def search_repository(repo_mining: pd.RepositoryMining, severity: Level, confidence: Level):
    """
    Iterate through all commits of the given repository from the given revision (default: active branch)

    :param repo_mining: The RepositoryMining object
    :param severity: The minimum severity level of vulnerabilities
    :param confidence: The minimum confidence level of vulnerabilities
    """

    output = {}
    gitpython_repo = git.Repo(repo_mining._path_to_repo)
    # PyDriller's GitRepository provides the parse_diff() used below
    pydriller_repo = pd.GitRepository(repo_mining._path_to_repo)
    commit_count = len(list(copy.copy(repo_mining).traverse_commits()))

    for commit in tqdm(repo_mining.traverse_commits(), total=commit_count):
        # Skip commits that touch too many files, which would cause the program to hang
        if len(commit.modifications) > 100:
            continue

        commit_message = process_commit_message(commit.msg)
        output[commit.hash] = {'date': str(commit.author_date)}

        # Find matching vulnerabilities
        output[commit.hash]['vulnerabilities'] = [{'name': vulnerability.name, 'match': regex_match.group()}
                                                  for vulnerability in vuln.vulnerability_list
                                                  for regex_match in [vulnerability.regex.search(commit_message)]
                                                  if regex_match]
        if not output[commit.hash]['vulnerabilities']:
            output[commit.hash].pop('vulnerabilities')

        # Add files changed, each modification is a file changed
        for modification in commit.modifications:
            file = modification.old_path if modification.change_type.name == 'DELETE' else modification.new_path
            file_extension = os.path.splitext(file)[1].lower()

            # Skip this file if the file extension is not supported
            if file_extension not in lang.supported_extensions:
                continue

            source_code_dict = get_source_code_dict(gitpython_repo, commit.hash, file, modification.source_code)

            # Encoding will be None or not supported by decode() for some cases
            if not source_code_dict:
                continue

            diff = pydriller_repo.parse_diff(modification.diff)

            # Run Flawfinder for C/C++ files
            if file_extension in c_lang.c_extensions:
                partial_output = run_flawfinder(diff, source_code_dict, severity, confidence)

            # Run bandit for Python files
            elif file_extension in py_lang.py_extensions:
                partial_output = run_bandit(diff, source_code_dict, severity, confidence)

            # Run 'grep'-like analysis for other languages files (very noisy)
            else:
                diff['unchanged'] = get_unchanged_lines(diff, source_code_dict)
                partial_output = process_diff(diff, file_extension, severity, confidence)

            # Only add the file if it has useful code changes (comments already removed)
            if partial_output:
                if 'files_changed' not in output[commit.hash]:
                    output[commit.hash]['files_changed'] = []

                output[commit.hash]['files_changed'].append({'file': file, **partial_output})

        # Remove the commit if the regex didn't match and no vulnerable lines of code were detected
        if 'vulnerabilities' not in output[commit.hash] and 'files_changed' not in output[commit.hash]:
            output.pop(commit.hash)

    return output
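
A hedged invocation sketch, assuming 'pd' is the pydriller alias implied by
the signature and 'Level' is this project's severity/confidence enum; the
path and levels below are placeholders:

import json

mining = pd.RepositoryMining('/path/to/repo')
results = search_repository(mining, severity=Level.MEDIUM, confidence=Level.MEDIUM)
print(json.dumps(results, indent=2))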
Example #19
    def ExtractFromCommits(since, to, url, excludes):
        # limit to the time of writing the script for reproducible results
        commits = RepositoryMining(path_to_repo=url, since=since, to=to)

        # Data extraction variables
        project_name = ""
        count = 0
        merges = 0
        all_authors = []
        author_commit_dict = dict()
        internal_authors = []
        external_authors = []
        code_changes = []
        iac_changes = []
        excluded_files = []
        for commit in commits.traverse_commits():
            if project_name == "":
                project_name = commit.project_name 
            msg = commit.msg

            author = commit.author.email
            org_author = commit.committer.email
            count += 1

            if commit.merge:
                merges += 1

            # extract files in this commit
            changedFiles = commit.modifications

            # remove files that match exclude paths
            files_for_analysis = []
            for file in changedFiles:
                if file.new_path is not None:
                    path = file.new_path
                else:
                    path = file.old_path

                addToCollection = True
                for exclude_path in excludes:
                    # maybe handle wildcards here
                    if exclude_path in path:
                        addToCollection = False
                        if (file.filename, exclude_path) not in excluded_files:
                            excluded_files.append((file.filename, exclude_path))
                if addToCollection:
                    files_for_analysis.append(file)

            for file in files_for_analysis:
                filename = file.filename
                loc = file.nloc
                lines_added = file.added
                lines_removed = file.removed

                if loc is not None:
                    # code files
                    code_changes.append((commit.hash, author, filename, msg, lines_added, lines_removed, org_author))
                else:
                    # documentation and IaC files
                    iac_changes.append((commit.hash, author, filename, msg, lines_added, lines_removed, org_author))

            # Keep an overall collection of all authors independent of company
            if author not in all_authors:
                all_authors.append(author)
                author_commit_dict[author] = 1
            else:
                author_commit_dict[author] = author_commit_dict[author] + 1

            # split into internal and external authors
            if "eficode" in author or "praqma" in author:
                if author not in internal_authors:
                    internal_authors.append(author)
            else:
                if author not in external_authors:
                    external_authors.append(author)

        return (project_name, count, merges, all_authors, author_commit_dict, internal_authors, external_authors, code_changes, iac_changes, excluded_files)
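
A sketch of calling this routine with a pinned time window; the dates, URL,
and exclude paths are placeholders, and the function is shown here as if it
were importable at module level:

from datetime import datetime

(project_name, count, merges, all_authors, author_commit_dict,
 internal_authors, external_authors, code_changes, iac_changes,
 excluded_files) = ExtractFromCommits(
    since=datetime(2020, 1, 1),
    to=datetime(2021, 1, 1),
    url='https://github.com/example/repo',
    excludes=['vendor/', 'docs/'])
print(project_name, count, merges)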
Example #20
class RepositoryProcessor:
    def __init__(self, repository: str, owner: str):
        self.owner = owner
        self.repository = os.path.split(repository)[-1]
        self.repo = GitRepository(repository)
        self.mining = RepositoryMining(repository)
        self.pairs = []
        random.seed(42)

    def run(self):
        self.get_all_filepairs()
        with open(os.path.join('filepairs', self.repository, 'pairs.txt'),
                  'w') as f:
            f.write('\n'.join(
                map(lambda x: f'{x[0]} {x[1]} {x[2]}', self.pairs)))
            f.write('\n')  # keep the final write inside the 'with' block

    def get_all_filepairs(self, file_filter=java_file_filter):
        commits = list(
            filter(lambda x: not x.merge, self.mining.traverse_commits()))
        for commit in commits:
            for modification in commit.modifications:
                if modification.change_type == ModificationType.MODIFY:
                    if file_filter(modification.filename):
                        self.get_file_pair(commit, modification)

    def get_file_pair(self, commit, modification: Modification):
        parent = commit.parents[0]

        repo = self.repo.project_name
        commit_hash = commit.hash
        filename = modification.filename

        path = os.path.join('filepairs', repo, commit_hash, filename)
        os.makedirs(path, exist_ok=True)

        self.repo.checkout(parent)
        before = os.path.join(self.repository, modification.old_path)
        before_saved = os.path.join(path,
                                    'before_' + commit_hash + '_' + filename)
        copyfile(before, before_saved)

        self.repo.checkout(commit_hash)
        after = os.path.join(self.repository, modification.new_path)
        after_saved = os.path.join(path,
                                   'after__' + commit_hash + '_' + filename)
        copyfile(after, after_saved)

        self.pairs.append(
            (before_saved, after_saved,
             commit_hash + '.' + self.owner + '.' + before.replace('/', '.')))

    def run_random(self, number):
        self.get_random_filepairs(number)
        with open(os.path.join('filepairs', self.repository, 'pairs.txt'),
                  'w') as f:
            f.write('\n'.join(
                map(lambda x: f'{x[0]} {x[1]} {x[2]}', self.pairs)))
            f.write('\n')

    def get_random_filepairs(self, number, file_filter=java_file_filter):
        commits = random.choices(list(
            filter(lambda x: not x.merge, self.mining.traverse_commits())),
                                 k=number)
        for idx, commit in enumerate(commits):
            print(f'Processing commit №{idx}: {commit.hash}')
            for modification in commit.modifications:
                if modification.change_type == ModificationType.MODIFY:
                    if file_filter(modification.filename):
                        self.get_file_pair(commit, modification)
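
A minimal sketch of driving the processor over a local clone; the path and
owner below are placeholders:

processor = RepositoryProcessor('/path/to/local/clone', owner='example-owner')
processor.run()             # process every non-merge MODIFY commit
# processor.run_random(50)  # or sample 50 commits (with replacement, per random.choices)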
Example #21
        if new_path != path:
            path = new_path
        else:
            return None

    return None


if __name__ == "__main__":
    current_repo()

if __name__ == "__main__":
    book_miner = RepositoryMining(current_repo())

if __name__ == "__main__":
    book_commits = book_miner.traverse_commits()
    book_first_commit = next(book_commits)

if __name__ == "__main__":
    [attr for attr in dir(book_first_commit) if not attr.startswith('_')]

if __name__ == "__main__":
    book_first_commit.msg

if __name__ == "__main__":
    [
        attr for attr in dir(book_first_commit.author)
        if not attr.startswith('_')
    ]

if __name__ == "__main__":
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("from_tag")
    parser.add_argument("to_tag")
    parser.add_argument("--verbose", "-v", action="count", default=0)
    args = parser.parse_args()

    log_level = logging.WARNING
    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose >= 1:
        log_level = logging.INFO

    logging.config.dictConfig({
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "development": {
                "format": "%(levelname)s %(name)s: %(message)s"
            }
        },
        "handlers": {
            "console": {
                "level": "DEBUG",
                "class": "logging.StreamHandler",
                "stream": sys.stderr,
                "formatter": "development",
            }
        },
        "root": {
            "handlers": ["console"],
            "level": "WARNING"
        },
        "loggers": {
            "normandy": {
                "propagate": False,
                "handlers": ["console"],
                "level": log_level
            }
        },
    })

    pr_commits = []
    dependency_updates = []
    migrations = []

    mine = RepositoryMining(".", from_tag=args.from_tag, to_tag=args.to_tag)
    num_commits_processed = 0
    for commit in mine.traverse_commits():
        for mod in commit.modifications:
            if mod.change_type == ModificationType.ADD and "/migrations/" in mod.new_path:
                migrations.append(get_migration_desc(mod))

        if not commit.merge:
            log.debug(f"Skipping {commit.hash[:7]}: Not a merge")
            # Not a PR merge
            continue
        commit.prs = get_pr_numbers(commit)
        if not commit.prs:
            log.debug(f"Skipping {commit.hash[:7]}: No PR numbers")
            continue

        if is_dependency_update(commit):
            log.debug(
                f"Processing commit {commit.hash[:7]} as dependency commit")
            dependency_updates.append(commit)
        else:
            log.debug(f"Processing commit {commit.hash[:7]} as normal commit")
            pr_commits.append(commit)
        num_commits_processed += 1

    if num_commits_processed == 0:
        log.error("No commits processed")
        raise Exception("No commits processed")

    # Accrue output in a buffer and print all at once so that log lines don't pollute it
    output = ""

    def output_line(line=""):
        nonlocal output
        output += line + "\n"

    output_line(f"# Version {args.to_tag}")
    output_line()
    output_line(f"## PRs merged since {args.from_tag}")
    output_line()

    if not pr_commits:
        output_line("None")

    for commit in pr_commits:
        for pr in commit.prs:
            output_line(
                f"* [PR {pr}]({PR_URL_TMPL.format(pr)}): {get_pr_title(commit, pr)}"
            )

    output_line()
    output_line("## Dependency updates")
    output_line()

    if not dependency_updates:
        output_line("None")

    for commit in dependency_updates:
        for pr in commit.prs:
            # use the buffer, like the PR section above, so log lines don't interleave
            output_line(
                f"* [PR {pr}]({PR_URL_TMPL.format(pr)}): {get_pr_title(commit, pr)}"
            )

    output_line()
    output_line("## Migrations")
    output_line()

    if not migrations:
        output_line("None")

    for migration in migrations:
        output_line(f"* {migration['path']} - {migration['description']}")

    print(f"\n\n{output}")
    print(f"detecting branch... ", end='', flush=True)

    found_default_branch = False

    for default_branch in branches_search_chain:

        if found_default_branch:
            break

        try:
            repo = RepositoryMining(repo_path,
                                    only_in_branch=default_branch,
                                    only_no_merge=True)

            for commit in repo.traverse_commits():
                check = commit.msg

                if check is not None:
                    found_default_branch = True
                    branch = default_branch
                    print(f"using {branch}", flush=True)
                    break

        except git.exc.GitCommandError:
            continue

    if not found_default_branch:
        print(f"Can't detect default branch", flush=True)
        sys.exit(1)
else: