Esempio n. 1
0
def get_patches(
    repository: pygit2.Repository,
    initial: Optional[str] = None,
    terminal: Optional[str] = None,
) -> List[PatchInfo]:
    """
    Returns a list of patches taking the given repository from the initial revision to the terminal
    one.

    NULL_REVISION on either endpoint is translated to the repository's empty-tree hash so that
    "everything added" / "everything removed" diffs work; an initial of None defaults to "HEAD".
    """
    # Resolve the sentinel revision names into values pygit2's diff understands.
    rev_initial = initial
    rev_terminal = terminal
    if rev_initial is None:
        rev_initial = "HEAD"
    elif initial == NULL_REVISION:
        rev_initial = get_empty_tree_hash(repository)
        if terminal is None:
            # Diffing from the empty tree with an unspecified terminal needs an explicit HEAD
            # endpoint here; the working-tree changes are merged in separately below.
            rev_terminal = "HEAD"

    if terminal == NULL_REVISION:
        rev_terminal = get_empty_tree_hash(repository)

    diff = repository.diff(a=rev_initial, b=rev_terminal, context_lines=0)

    # We have to handle the case where initial=NULL_REVISION and terminal=None separately because of
    # the lack of tracked file objects in the empty revision and how git handles the default empty
    # argument for the terminal revision (checks against tracked files in initial revision).
    if initial == NULL_REVISION and terminal is None:
        status_diff = repository.diff()
        diff.merge(status_diff)

    patches = [
        PatchInfo(
            old_file=patch.delta.old_file.path,
            new_file=patch.delta.new_file.path,
            hunks=[process_hunk(hunk) for hunk in patch.hunks],
        )
        for patch in diff
    ]

    # Attach the full file contents on both sides of each patch.
    for patch in patches:
        old_filepath = os.path.join(repository.workdir, patch.old_file)
        new_filepath = os.path.join(repository.workdir, patch.new_file)
        old_source = None
        if initial != NULL_REVISION:
            old_source = revision_file(repository, rev_initial, old_filepath)
        new_source = None
        # NOTE(review): this passes `terminal` (possibly None) rather than `rev_terminal`,
        # asymmetric with the old-source branch above -- presumably so a None terminal reads the
        # working-tree version of the file; confirm against revision_file's contract.
        if terminal != NULL_REVISION:
            new_source = revision_file(repository, terminal, new_filepath)
        # The following awkward workaround is because mypy-protobuf has weird behaviour around
        # fields. They are defined as optional in the __init__ method of the message class, but not
        # optional as attributes of the message class.
        if old_source is not None:
            patch.old_source = old_source
        if new_source is not None:
            patch.new_source = new_source

    return patches
class Artifact(object):
    """Decorator that rewrites a keyword path argument so artifact filenames
    embed the current git commit (and a hash of the configuration), refusing
    to run against a dirty repository unless explicitly allowed."""

    def __init__(self, keyword, config=None, allow_dirty=False):
        """
        :param keyword: name of the kwarg holding the output path to rewrite.
        :param config: mapping whose items are hashed into the artifact name.
            NOTE(review): ``hash()`` of a string is salted per process
            (PYTHONHASHSEED), so this tag is not stable across runs -- confirm
            whether that is acceptable.
        :param allow_dirty: if True, tolerate unstaged changes (artifacts are
            marked with the dirty suffix); otherwise raise OSError.
        """
        self.keyword = keyword
        # Use None as the default instead of a mutable `{}` default argument,
        # which would be shared between all instances.
        config = {} if config is None else config
        self.hashed_config = hash(''.join(
            ['{}{}'.format(k, v) for k, v in config.items()]))
        self.allow_dirty = allow_dirty
        self.repo = Repository(discover_repository(os.getcwd()))
        self.dirty_suffix = '-dirty'
        self.is_dirty = self.repo.diff().stats.files_changed > 0
        if self.is_dirty and self.allow_dirty:
            # logger.warn is a deprecated alias of warning
            logger.warning('Repository has unstaged changes. '
                           'Creating artifacts, but marking them dirty.')
        elif self.is_dirty:
            raise OSError(
                'Refusing to create artifacts with a dirty repository.')

        # reuse self.dirty_suffix instead of repeating the '-dirty' literal
        self.commitish = str(self.repo.head.resolve().target)[:7] + (
            self.dirty_suffix if self.is_dirty else '')

    def __call__(self, f):
        def rewritten(*args, **kwargs):
            output = kwargs.get(self.keyword) or ''
            if not output:
                # The original passed a printf-style tuple to KeyError that was
                # never formatted; build the message explicitly instead.
                raise KeyError('{} was not in kwargs for decorated function'
                               .format(self.keyword))
            parts = os.path.split(output)
            # split "name.ext" on the platform extension separator
            fname, ext = parts[-1].split(os.path.extsep)
            with_info = fname + '-' + self.commitish + '-' + str(
                self.hashed_config)[:10] + os.path.extsep + ext
            kwargs[self.keyword] = os.path.join(*(list(parts)[:-1] +
                                                  [with_info]))
            return f(*args, **kwargs)

        return rewritten
Esempio n. 3
0
def get_git_info(git_working_tree_dir):
    """Summarise the git repository containing *git_working_tree_dir*.

    Returns a dict with the HEAD commit (hash/message/author), the current
    branch name, the working-tree diff stats and the number of commits
    reachable from HEAD.
    """
    repository_path = discover_repository(git_working_tree_dir)
    assert repository_path is not None

    repo = Repository(repository_path)
    history = list(repo.walk(repo.head.target, GIT_SORT_NONE))
    head = history[0]
    workdir_diff = repo.diff()

    head_info = {
        'hash': head.hex,
        'message': head.message,
        'author': head.author.name,
    }
    return {
        'head_commit': head_info,
        'branch': {'name': repo.head.shorthand},
        'stats': {'files_changed': workdir_diff.stats.files_changed},
        'num_commits': len(history),
    }
Esempio n. 4
0
class GitAccessor(ScmAccessor):
    """ScmAccessor backed by a pygit2 Repository.

    Walks the commit history and turns each commit into a LogEntry with
    message, committer identity, timestamp and the list of touched files.
    """

    def __init__(self, repo_path, start_rev=None):
        # start_rev, when given, truncates the log after that commit id is reached.
        super().__init__(repo_path=repo_path, start_rev=start_rev)
        self._scm = Repository(path.join(repo_path))

    def get_log(self):
        """Return a list of LogEntry objects (time-sorted walk from HEAD),
        stopping after ``self._start_rev`` when one was configured."""
        data = []

        for c in self._scm.walk(self._scm.head.target, GIT_SORT_TIME):
            print("Processing commit %s" % c.id)

            # NOTE(review): arguments are (commit, parent) -- pygit2 diffs from
            # the first argument to the second, so this is the *reverse* diff.
            # Only file names are extracted below, so the direction may not
            # matter here; confirm before relying on the +/- stats.
            diff = self._scm.diff(c, c.parents[0]).stats.format(
                GIT_DIFF_STATS_FULL, 1) if c.parents else ""

            diff = diff.splitlines()
            if len(diff) >= 1:
                diff = diff[:-1]  # drop the trailing summary line of the stats output

            # keep only the file-name column of each "path | N +-" stats line
            stripped_diff = [d.split("|")[0].strip() for d in diff]

            e = LogEntry()
            e.id = c.id
            e.msg = c.message.strip("\n")
            # NOTE(review): the *committer* identity is stored in the author
            # fields -- confirm this is intentional.
            e.author = c.committer.name
            e.email = c.committer.email
            # naive local-time conversion of the commit timestamp
            e.time = datetime.fromtimestamp(c.commit_time)
            e.diff = stripped_diff
            data.append(e)

            if self._start_rev and c.id.hex == self._start_rev:
                break

        return data
Esempio n. 5
0
def get_changed_contrib_names() -> List[str]:
    """Return the names of contrib packages touched relative to the local
    ``main`` branch, including packages whose dedicated test module changed."""
    repo = Repository(root_path)
    main_branch = repo.lookup_branch('main')
    if main_branch is None:
        raise RuntimeError("Can't find `main` branch to compare to.")

    # Collect both sides of every delta so renames are covered.
    file_paths = set()
    for patch in repo.diff(a=main_branch):
        delta = patch.delta
        file_paths.add(delta.old_file.path)
        file_paths.add(delta.new_file.path)

    changed_contribs = set()
    for filepath in file_paths:
        if '__pycache__' in filepath:
            continue
        path_parents = (root_path / Path(filepath)).parents
        for contrib_path in contrib_paths:
            if contrib_path in path_parents:
                changed_contribs.add(contrib_path.name)
        # A change under the test tree maps back to its contrib package via
        # the `test_<package>.py` naming convention.
        if test_path in path_parents:
            for contrib_path in contrib_paths:
                if filepath.endswith(f'test_{contrib_path.parts[-1]}.py'):
                    changed_contribs.add(contrib_path.name)

    return list(changed_contribs)
Esempio n. 6
0
def parse_diffusion_features(pid, repo_path, branch, start, stop=-1):
    """
    Function to extract diffusion features from a set of commits.

    Walks *branch* of the repository at *repo_path* oldest-first, diffs each
    commit against its predecessor and records per commit: hash, number of
    modified subsystems, number of modified root modules and the entropy of
    the change. Results are written into the shared RES[pid] slot.

    :param pid: worker id, used as the tqdm position and the RES key.
    :param start: 1-based start index into the commit list (0 keeps all).
    :param stop: exclusive stop index, or -1 for "until the end".
    """
    repo = Repository(repo_path)

    head = repo.references.get(branch)
    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    # convert the 1-based start to a 0-based slice index
    start = start - 1 if (start > 0) else start
    commits = commits[start:stop] if (stop != -1) else commits[start:]

    features = [[] for c in range(len(commits))]
    # commits[i] is the predecessor of `commit` because enumeration starts
    # at commits[1]; the last features slot stays empty by construction.
    for i, commit in enumerate(tqdm(commits[1:], position=pid)):
        diff = repo.diff(commits[i], commit)

        patches = [p for p in diff]

        # Extract all different subsystems that have been modified
        modules = set([])
        subsystems_mapping = {}
        entropy_change = 0

        file_changes = []
        total_change = 0
        for patch in patches:
            # Skip binary files
            if patch.delta.is_binary:
                continue
            _, addition, deletions = patch.line_stats
            total_change = total_change + (addition + deletions)
            file_changes.append(addition + deletions)

            # Store all subsystems (every directory level of the file's path)
            fpath = patch.delta.new_file.path
            subsystems = fpath.split('/')[:-1]

            root = subsystems_mapping
            for system in subsystems:
                if system not in root:
                    root[system] = {}
                root = root[system]
            # BUG FIX: the original wrote `if subsystems > 0:` which compares
            # a list to an int and raises TypeError on Python 3; test for
            # non-emptiness instead.
            if subsystems:
                modules.add(subsystems[0])

        # Check how many subsystems that have been touched
        modified_systems = count_diffing_subsystems(subsystems_mapping)

        # Calculate the entropy for the commit
        entropy_change = count_entropy(file_changes, total_change)

        # Add all features
        features[i].append(str(commit.hex))
        features[i].append(str(float(modified_systems)))
        features[i].append(str(float(len(modules))))
        features[i].append(str(float(entropy_change)))

    RES[pid] = features
Esempio n. 7
0
def get_and_update_repo_cache(repo_path):
    # Incrementally build per-author / per-day commit statistics for the
    # repository at repo_path, caching results in '<repo_path>-stats.cache'
    # so later runs only process commits newer than the cached 'latest_sha'.
    # NOTE: Python 2 code (print statements); `load`/`dump` come from an
    # import outside this view -- presumably pickle; confirm.
    cache_filename = '%s-stats.cache' % repo_path
    if os.path.exists(cache_filename):
        with open(cache_filename) as f:
            data = load(f)
    else:
        data = {
            'author_to_month_to_additions': defaultdict(defaultdict_int),
            'author_to_month_to_deletions': defaultdict(defaultdict_int),
            'author_to_month_to_commits': defaultdict(defaultdict_int),
            'day_to_count': defaultdict(defaultdict_int),
            'latest_sha': None,
        }

    repo = Repository(repo_path)

    count = 0
    for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL):
        count += 1
        if commit.type == GIT_OBJ_COMMIT:
            # stop once we reach the commit recorded by the previous run
            if data['latest_sha'] == commit.hex:
                break

            # skip merge commits, detected by message convention
            if not commit.message.lower().startswith('merge'):
                try:
                    d = repo.diff('%s^' % commit.hex, commit)
                except KeyError:
                    # First commit!
                    break
                patches = list(d)
                additions = sum([p.additions for p in patches])
                deletions = sum([p.deletions for p in patches])

                # collapse alternate e-mail addresses to a canonical author
                author = author_aliases.get(commit.author.email, commit.author.email)

                # 'Lines' tracks the net line-count change per day
                day = date.fromtimestamp(commit.commit_time)
                data['day_to_count']['Lines'][day] += additions
                data['day_to_count']['Lines'][day] -= deletions

                # Heuristics: skip probable library imports / bulk changes
                # unless whitelisted. NOTE(review): the day_to_count update
                # above has already been applied when these skips trigger.
                if additions > 1000 and deletions < 5 and commit.hex not in whitelist_commits:
                    if commit.hex not in blacklist_commits:
                        print 'WARNING: ignored %s looks like an embedding of a lib (message: %s)' % (commit.hex, commit.message)
                    continue
                if (additions > 3000 or deletions > 3000) and commit.hex not in whitelist_commits:
                    if commit.hex not in blacklist_commits and additions != deletions:  # Guess that if additions == deletions it's a big rename of files
                        print 'WARNING: ignored %s because it is bigger than 3k lines. Put this commit in the whitelist or the blacklist (message: %s)' % (commit.hex, commit.message)
                    continue
                month = date(day.year, day.month, 1)
                data['author_to_month_to_additions'][author][month] += additions
                data['author_to_month_to_deletions'][author][month] += deletions
                data['author_to_month_to_commits'][author][month] += 1
                # remember the newest processed commit for the next run
                if data['latest_sha'] is None:
                    data['latest_sha'] = commit.hex

    with open(cache_filename, 'w') as f:
        dump(data, f)

    return data
Esempio n. 8
0
def getShortStat(obj, prevId, thisId):
    """Return ``(files_changed, insertions, deletions)`` for the diff between
    two revisions of the repository at ``obj.vcsDir``."""
    repo = Repository(obj.vcsDir)
    old_hex = repo.revparse_single(prevId).id.hex
    new_hex = repo.revparse_single(thisId).id.hex
    stats = repo.diff(a=old_hex, b=new_hex).stats
    return (stats.files_changed, stats.insertions, stats.deletions)
Esempio n. 9
0
def getCommitShortStat(obj, commit_id):
    """Return ``(files_changed, insertions, deletions)`` for *commit_id*
    relative to its first parent."""
    repo = Repository(obj.vcsDir)
    commit = repo.revparse_single(commit_id)
    stats = repo.diff(
        a=commit.parent_ids[0].hex,
        b=commit.id.hex,
    ).stats
    return (stats.files_changed, stats.insertions, stats.deletions)
Esempio n. 10
0
def processGitDiff(commitsNum):
  """Walk the newest *commitsNum* commits of the local postgres repository
  and print per-file added/deleted line counts between consecutive commits.

  NOTE(review): this uses a legacy pygit2 patch API (``p.old_file_path``,
  ``p.new_file_path``, tuple-style hunk lines) that modern pygit2 no longer
  exposes -- confirm the pinned pygit2 version before reuse.
  """
  counter = commitsNum;
  repositoryName = "../git-repos/postgres"
  repo = Repository(repositoryName +"/"+ ".git")
  childCommitNumber = ""
  for commit in repo.walk(repo.head.target, GIT_SORT_TIME):
    counter-=1;
    if counter<0:
      break
    currentCommitNumber = commit.oid.hex
    # skip the first iteration; we need a (commit, child) pair to diff
    if(childCommitNumber!=""):
        diff = repo.diff(currentCommitNumber, childCommitNumber)
        fileChanges = 0;
        for p in diff:
          print(p.old_file_path)
              #print(p.old_oid)
          print(p.new_file_path)
              #print(p.new_oid)
          #print(p.additions)
          addLines = 0;
          deleteLines = 0;
          for hunk in p.hunks:
                  #print(hunk.old_start)
                  #print(hunk.old_lines)
                  #print(hunk.new_start)
                  #print(hunk.new_lines)
            for line in hunk.lines:
              if line[0] == "+":
                addLines+=1;
              if line[0] == "-":
                deleteLines+=1;
          print("lines added" + str(addLines));
          print("lines deleted" + str(deleteLines));
        # NOTE(review): this increment sits *outside* the `for p in diff`
        # loop, so "file changed1" is printed once per commit regardless of
        # how many files changed -- it was likely meant inside the loop.
        fileChanges+=1
        print("file changed" + str(fileChanges));
    childCommitNumber = commit.oid.hex;
Esempio n. 11
0
class CollectGit(object):
    """
    Small Helper class for small repositories.
    This does not scale because we hold a lot of data in memory.
    """

    _regex_comment = re.compile(
        r"(//[^\"\n\r]*(?:\"[^\"\n\r]*\"[^\"\n\r]*)*[\r\n]|/\*([^*]|\*(?!/))*?\*/)(?=[^\"]*(?:\"[^\"]*\"[^\"]*)*$)"
    )
    _regex_jdoc_line = re.compile(r"(- |\+)\s*(\*|/\*).*")

    def __init__(self, path):
        """Open the repository at *path* (normalised to its ``.git``
        directory) and initialise the caches used by the other helpers."""
        # normalise "repo", "repo/" or "repo/.git" to the .git directory
        if not path.endswith('.git'):
            if not path.endswith('/'):
                path = path + '/'
            path = path + '.git'
        self._log = logging.getLogger(self.__class__.__name__)
        self._path = path
        self._repo = Repository(self._path)

        # per-revision caches filled lazily while walking history
        self._hunks = {}
        self._file_actions = {}
        self._bugfix = {}
        self._msgs = {}
        self._days = {}
        self._cdays = {}
        self._branches = {}
        self._tags = {}

        # rename/copy detection options used by find_similar on diffs
        self._dopts = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES
        self._SIMILARITY_THRESHOLD = 50
        self._graph = nx.DiGraph()

    @classmethod
    def clone_repo(cls, uri, local_path):
        """Clone *uri* under *local_path*, or fetch when a working copy
        already exists there. Returns the local repository path."""
        project_name = uri.split('/')[-1].split('.git')[0]
        repo_path = local_path + '/' + project_name + '/'

        if os.path.isdir(repo_path):
            # already cloned: just update it
            cmd = ['git', 'fetch']
            failure_msg = 'Error pulling repository {} to {}'.format(
                uri, repo_path)
        else:
            os.mkdir(repo_path)
            cmd = ['git', 'clone', uri, repo_path]
            failure_msg = 'Error cloning repository {} to {}'.format(
                uri, repo_path)

        result = subprocess.run(cmd,
                                cwd=repo_path,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        if result.returncode != 0:
            raise Exception(failure_msg)
        return repo_path

    def _changed_lines(self, hunk):
        added_lines = []
        deleted_lines = []

        del_line = hunk['old_start']
        add_line = hunk['new_start']

        for line in hunk['content'].split('\n'):

            tmp = line[1:].strip()
            # is_comment = tmp.startswith('//') or tmp.startswith('/*') or tmp.startswith('*')

            if line.startswith('+'):
                added_lines.append((add_line, tmp))
                del_line -= 1
            if line.startswith('-'):
                deleted_lines.append((del_line, tmp))
                add_line -= 1

            del_line += 1
            add_line += 1

        return added_lines, deleted_lines

    def _comment_only_change(self, content):
        content = content + '\n'  # required for regex to drop comments
        content = re.sub(self._regex_comment, "", content)
        removed = ''
        added = ''
        for line in content.split('\n'):
            line = re.sub(
                r"\s+", " ", line, flags=re.UNICODE
            )  # replace all kinds of whitespaces (also multiple) with sińgle whitespace
            if not re.match(self._regex_jdoc_line, line):
                if line.startswith('-'):
                    removed += line[1:].strip()
                elif line.startswith('+'):
                    added += line[1:].strip()
        return removed == added

    def _blame_lines(self,
                     revision_hash,
                     filepath,
                     strategy,
                     ignore_lines=False,
                     validated_bugfix_lines=False):
        """We want to find changed lines for one file in one commit (from the previous commit).

        For this we are iterating over the diff and counting the lines that are deleted (changed) from the original file.
        We ignore all added lines.

        ignore_lines is already specific to all changed hunks of the file for which blame_lines is called

        Returns a list of (line_number, stripped_text) tuples for the deleted
        lines that survive all filters below.
        """
        # (re)compute and cache the hunks for this revision
        c = self._repo.revparse_single('{}'.format(revision_hash))
        self._hunks[revision_hash] = self._get_hunks(c)

        changed_lines = []
        if revision_hash not in self._hunks.keys(
        ) or not self._hunks[revision_hash]:
            return changed_lines

        for h in self._hunks[revision_hash]:
            # only consider hunks touching the requested file
            if h['new_file'] != filepath:
                continue

            # only whitespace or comment changes in the hunk, ignore
            if strategy == 'code_only' and self._comment_only_change(
                    h['content']):
                self._log.debug(
                    'detected whitepace or comment only change in {} for {}'.
                    format(revision_hash, filepath))
                continue

            added, deleted = self._changed_lines(h)
            for dt in deleted:
                # skip duplicates and empty lines (dt[1] is the stripped text)
                if dt not in changed_lines and dt[1]:
                    # comment-looking deleted lines are not code changes
                    if strategy == 'code_only' and dt[1].startswith(
                        ('//', '/*', '*')):
                        continue

                    # we may only want validated lines
                    if validated_bugfix_lines is not False:
                        if dt[0] not in validated_bugfix_lines:
                            continue

                    # we may ignore lines, e.g., refactorings
                    if ignore_lines:
                        ignore = False
                        for start_line, end_line in ignore_lines:
                            if start_line <= dt[0] <= end_line:
                                ignore = True
                                break

                        # if we hit the line in our ignore list we continue to the next
                        if ignore:
                            # self._log.warn('ignore line {} in file {} in commit {} because of refactoring detection'.format(dt[0], filepath, revision_hash))
                            continue

                    changed_lines.append(dt)

        return changed_lines

    def blame(self,
              revision_hash,
              filepath,
              strategy='code_only',
              ignore_lines=False,
              validated_bugfix_lines=False):
        """Collect a list of commits where the given revision and file were last changed.

        Uses git blame.

        :param str revision_hash: Commit for which we want to collect blame commits.
        :param str filepath: File for which we want to collect blame commits.
        :rtype: list
        :returns: A list of tuples of blame commits and the original file for the given parameters.
        """
        commits = []

        # - ignore if commit is not in graph
        if revision_hash not in self._graph:
            return []

        # bail on multiple parents: blaming against one parent of a merge
        # would attribute the other parent's lines incorrectly
        parents = list(self._graph.predecessors(revision_hash))
        if len(parents) > 1:
            self._log.debug(
                'skipping blame on revision: {} because it is a merge commit'.
                format(revision_hash))
            return []

        changed_lines = self._blame_lines(revision_hash, filepath, strategy,
                                          ignore_lines, validated_bugfix_lines)
        # blame against the parent so we find who last touched each line
        # *before* this revision changed it
        parent_commit = self._repo.revparse_single('{}^'.format(revision_hash))

        blame = self._repo.blame(filepath,
                                 flags=GIT_BLAME_TRACK_COPIES_SAME_FILE,
                                 newest_commit=parent_commit.hex)
        for lineno, line in changed_lines:
            # returns blamehunk for specific line
            try:
                bh = blame.for_line(lineno)
            except IndexError:
                # this happens when we have the wrong parent node.
                # BUG FIX: the original formatted `bh.orig_commit_id` in this
                # handler, but `bh` is unbound when for_line() raises, so the
                # IndexError was masked by a NameError.
                self._log.error(
                    'tried to get file: {}, line: {}, revision: {}'.format(
                        filepath, lineno, revision_hash))
                raise  # this is critical

            inducing_commit = self._repo.revparse_single(str(
                bh.orig_commit_id))

            commits.append((inducing_commit.hex, bh.orig_path))

        # make unique
        return list(set(commits))

    def commit_information(self, revision_hash):
        obj = self._repo.get(revision_hash)

        return {
            'author_name':
            obj.author.name,
            'author_email':
            obj.author.email,
            'committer_name':
            obj.committer.name,
            'committer_email':
            obj.committer.email,
            'committer_date_utc':
            datetime.fromtimestamp(obj.commit_time, tz=timezone.utc),
            'committer_date':
            obj.commit_time,
            'committer_date_offset':
            obj.commit_time_offset,
            'message':
            obj.message,
            'file_actions':
            self._file_actions[revision_hash]
        }

    def file_actions(self, revision_hash):
        return self._file_actions[revision_hash]

    def all_files(self, revision_hash):
        # 1. checkout repo
        self._checkout_revision(revision_hash)

        # 2. list files
        return self._list_files()

    def first_occurence(self, filename):
        """Return the datetime at which *filename* first appeared in history.

        Shells out to ``git log --follow --diff-filter=A`` because libgit
        cannot track renames across history (see link below).
        """
        # file rename tracking is not possible currently in libgit, see:
        # https://github.com/libgit2/libgit2/issues/3041

        # find first occurence of file with git cli

        # git log --follow --diff-filter=A --find-renames=40% foo.js
        path = self._path.replace('.git', '')
        c = subprocess.run([
            'git', 'log', '--all', '--pretty=tformat:"%H %ci"', '--follow',
            '--diff-filter=A', '--find-renames=80%', '--', filename
        ],
                           cwd=path,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
        if c.returncode != 0:
            err = 'Error finding first occurrence of file: {}'.format(filename)
            self._log.error(err)
            self._log.error(c.stderr)
            raise Exception(err)

        full = c.stdout.decode('utf-8')
        try:
            # output is newest-first and ends with a newline, so the oldest
            # commit is the second-to-last element after splitting
            first_line = full.split('\n')[-2]
        except IndexError:
            if not full:
                print('no git log for file {}'.format(filename))
            print(full)
            raise
        # drop the hash column and the tformat quotes, keeping the date
        first_date = ' '.join(first_line.split(' ')[1:]).replace('"', '')
        dt = datetime.strptime(
            first_date, '%Y-%m-%d %H:%M:%S %z'
        )  # we can do this here because we control the input format, %z does not cover +01:00 just +100 (at least in 3.6)
        return dt

    def tags(self):
        regex = re.compile('^refs/tags')
        ret = []
        for tagref in filter(lambda r: regex.match(r),
                             self._repo.listall_references()):
            tag = self._repo.lookup_reference(tagref)
            target = self._repo.lookup_reference(tagref).peel()
            ret.append({
                'name': tag.name.replace('refs/tags/', ''),
                'revision_hash': target.id
            })
        return ret

    def _checkout_revision(self, revision):
        """Force-checkout *revision* quietly via the git CLI; returns True on
        success. We ignore stdout output."""
        workdir = self._path.replace('.git', '')
        result = subprocess.run(['git', 'checkout', '-q', '-f', revision],
                                cwd=workdir,
                                stdout=subprocess.PIPE)
        return result.returncode == 0

    def _list_files(self):
        """The slower list_files: walk the working tree with os.walk and
        return paths relative to the repository root."""
        workdir = self._path.replace('.git', '')

        found = []
        for root, _dirs, names in os.walk(workdir):
            for name in names:
                absolute = os.path.join(root, name)
                # strip the working-tree prefix to get a repo-relative path
                found.append(absolute.replace(workdir, ''))
        return found

    def _list_files2(self):
        """The faster list_files (relies on the ``find`` command); returns
        repo-relative paths of all .java files."""
        workdir = self._path.replace('.git', '')
        out = subprocess.check_output(['find', '.', '-iname', '*.java'],
                                      cwd=workdir)

        found = []
        for entry in out.decode('utf-8').split('\n'):
            if entry.lower().endswith('.java'):
                found.append(entry.replace('./', ''))

        return found

    def _get_hunks(self, commit):
        """Collect and return the hunks of *commit* against each parent.

        For the initial (parentless) commit the tree is diffed against the
        empty tree; old/new start and extent are swapped back because
        diff_to_tree yields the reverse direction.
        """
        diffs = []
        hunks = []

        # for initial commit (or orphan commits) pygit2 needs some special attention
        initial = False
        if not commit.parents:
            initial = True
            diffs.append((None,
                          commit.tree.diff_to_tree(context_lines=0,
                                                   interhunk_lines=1)))

        # we may have multiple parents (merge commit)
        for parent in commit.parents:
            # we need all information from each parent because in a merge each parent may add different files
            tmp = self._repo.diff(parent,
                                  commit,
                                  context_lines=0,
                                  interhunk_lines=1)
            tmp.find_similar(self._dopts, self._SIMILARITY_THRESHOLD,
                             self._SIMILARITY_THRESHOLD)
            diffs.append((parent.hex, tmp))

        for parent, diff in diffs:
            checked_paths = set()
            for patch in diff:
                # NOTE(review): nothing is ever added to checked_paths, so
                # this duplicate warning can never fire -- confirm whether a
                # checked_paths.add(...) was intended.
                if patch.delta.new_file.path in checked_paths:
                    self._log.warn('already have {} in checked_paths'.format(
                        patch.delta.new_file.path))
                    continue
                # map libgit2 delta status codes to one-letter action codes.
                # NOTE(review): `mode` is computed but never used in this
                # method -- confirm whether it was meant to be stored.
                mode = 'X'
                if patch.delta.status == 1:
                    mode = 'A'
                elif patch.delta.status == 2:
                    mode = 'D'
                elif patch.delta.status == 3:
                    mode = 'M'
                elif patch.delta.status == 4:
                    mode = 'R'
                elif patch.delta.status == 5:
                    mode = 'C'
                elif patch.delta.status == 6:
                    mode = 'I'
                elif patch.delta.status == 7:
                    mode = 'U'
                elif patch.delta.status == 8:
                    mode = 'T'

                # diff to tree gives D for inital commit otherwise
                if initial:
                    mode = 'A'

                # we may have hunks to add
                # NOTE(review): this initialises self._hunks[commit.hex] but
                # hunks below are appended to the local list only -- the
                # caller (_blame_lines) assigns the return value itself.
                if patch.hunks and commit.hex not in self._hunks.keys():
                    self._hunks[commit.hex] = []

                # add hunks
                for hunk in patch.hunks:
                    # initial is special case
                    if initial:
                        content = ''.join(
                            ['+' + l.content for l in hunk.lines])
                        hunks.append({
                            'header': hunk.header,
                            'new_file': patch.delta.new_file.path,
                            'new_start': hunk.old_start,
                            'new_lines': hunk.old_lines,
                            'old_start': hunk.new_start,
                            'old_lines': hunk.new_lines,
                            'content': content
                        })
                    else:
                        content = ''.join(
                            [l.origin + l.content for l in hunk.lines])
                        hunks.append({
                            'header': hunk.header,
                            'new_file': patch.delta.new_file.path,
                            'new_start': hunk.new_start,
                            'new_lines': hunk.new_lines,
                            'old_start': hunk.old_start,
                            'old_lines': hunk.old_lines,
                            'content': content
                        })
        return hunks

    def _changed_files(self, commit):
        """Return the list of files touched by *commit*.

        Each entry is ``[mode, new_path, old_path_or_None, file_stats]``
        where *mode* is one of A/D/M/R/C/I/U/T ('X' for unknown).  As a
        side effect, every patch's hunks are collected into
        ``self._hunks[commit.hex]``.
        """
        # pygit2 GIT_DELTA_* status codes -> single-letter mode
        status_to_mode = {1: 'A', 2: 'D', 3: 'M', 4: 'R', 5: 'C',
                          6: 'I', 7: 'U', 8: 'T'}

        changed_files = []
        diffs = []

        # for initial commit (or orphan commits) pygit2 needs some special
        # attention: diff the commit tree against the empty tree
        initial = False
        if not commit.parents:
            initial = True
            diffs.append((None,
                          commit.tree.diff_to_tree(context_lines=0,
                                                   interhunk_lines=1)))

        # we may have multiple parents (merge commit); we need the diff
        # against each parent because each may contribute different files
        for parent in commit.parents:
            tmp = self._repo.diff(parent,
                                  commit,
                                  context_lines=0,
                                  interhunk_lines=1)
            tmp.find_similar(self._dopts, self._SIMILARITY_THRESHOLD,
                             self._SIMILARITY_THRESHOLD)
            diffs.append((parent.hex, tmp))

        for parent, diff in diffs:
            checked_paths = set()
            for patch in diff:
                path = patch.delta.new_file.path
                if path in checked_paths:
                    # Logger.warn is deprecated; warning() is the supported name
                    self._log.warning(
                        'already have {} in checked_paths'.format(path))
                    continue

                mode = status_to_mode.get(patch.delta.status, 'X')

                # diff_to_tree gives D for initial commit otherwise
                if initial:
                    mode = 'A'

                # we may have hunks to add
                if patch.hunks and commit.hex not in self._hunks:
                    self._hunks[commit.hex] = []

                # add hunks; the initial commit is diffed in reverse
                # (tree -> empty tree), so old/new fields are swapped and
                # every line is recorded as an addition
                for hunk in patch.hunks:
                    if initial:
                        self._hunks[commit.hex].append({
                            'header': hunk.header,
                            'new_file': path,
                            'new_start': hunk.old_start,
                            'new_lines': hunk.old_lines,
                            'old_start': hunk.new_start,
                            'old_lines': hunk.new_lines,
                            'content': ''.join(
                                '+' + l.content for l in hunk.lines),
                        })
                    else:
                        self._hunks[commit.hex].append({
                            'header': hunk.header,
                            'new_file': path,
                            'new_start': hunk.new_start,
                            'new_lines': hunk.new_lines,
                            'old_start': hunk.old_start,
                            'old_lines': hunk.old_lines,
                            'content': ''.join(
                                l.origin + l.content for l in hunk.lines),
                        })

                # collect line stats; the reversed initial diff also swaps
                # the added/deleted counters
                if initial:
                    fa = {
                        'lines_added': patch.line_stats[2],
                        'lines_deleted': patch.line_stats[1],
                        'changeset_size': len(diff),
                        'parent': None
                    }
                else:
                    fa = {
                        'lines_added': patch.line_stats[1],
                        'lines_deleted': patch.line_stats[2],
                        'changeset_size': len(diff),
                        'parent': parent
                    }

                if mode in ('C', 'R'):
                    # copies and renames also carry the source path
                    changed_file = [mode, path, patch.delta.old_file.path, fa]
                else:
                    changed_file = [mode, path, None, fa]

                checked_paths.add(path)
                changed_files.append(changed_file)
        return changed_files

    def collect(self):
        """Walk every branch and every annotated tag, building the commit
        graph, and return it."""
        # branches first
        for branch_name in list(self._repo.branches):
            self._collect_branch(branch_name)

        # then annotated tag objects found in the object database
        for oid in self._repo:
            obj = self._repo[oid]
            if obj.type == GIT_OBJ_TAG:
                self._collect_branch(obj, is_tag=True)

        return self._graph

    def _collect_branch(self, branch, is_tag=False):
        """Add all commits reachable from *branch* as nodes, and their
        parent->child relations as edges, of ``self._graph``.

        *branch* may be a branch name (str) or an object with a ``target``
        (branch reference or tag object).
        """
        if isinstance(branch, str):
            branch = self._repo.branches[branch]

        try:
            # add nodes to graph
            for c in self._repo.walk(branch.target):
                self._graph.add_node(c.hex)

            # add edges to graph
            for c in self._repo.walk(branch.target):
                for p in c.parents:
                    self._graph.add_edge(p.hex, c.hex)
        except ValueError:
            # best-effort: skip references whose target cannot be walked
            # (e.g. unborn branches); deliberately not re-raised
            pass
Esempio n. 12
0
def process(repo, history):
    """Diff each consecutive pair of commits in *history* and bucket every
    diff by number of files changed, number of hunks, and number of
    modified lines; the resulting histograms are written to
    '<repo>-<timestamp>.txt'.
    """
    # get a repo on disk
    base = Repository(repo)
    base.checkout('HEAD')

    file_xsmall = file_small = file_medium = file_large = file_xlarge = 0
    hunk_xsmall = hunk_small = hunk_medium = hunk_large = hunk_xlarge = 0
    line_xsmall = line_small = line_medium = line_large = line_xlarge = 0

    i = 0
    while i < len(history) - 1:
        print('\rDiff#: ' + str(i + 1) + ' of ' + str(len(history) - 1), end='')

        t0 = base.revparse_single(history[i].hex)
        t1 = base.revparse_single(history[i + 1].hex)

        try:
            diff = base.diff(t0, t1)
        except ValueError:
            print('')
            print('Value Error')
            print('')
            i += 1
            continue

        files = [p for p in diff]

        # bucket by number of files changed (ranges are disjoint)
        n_files = len(files)
        if n_files == 1:
            file_xsmall += 1
        elif 2 <= n_files <= 4:
            file_small += 1
        elif 5 <= n_files <= 7:
            file_medium += 1
        elif 8 <= n_files <= 10:
            file_large += 1
        elif n_files >= 11:
            file_xlarge += 1

        hunks_in_commit = 0
        lines_in_commit = 0

        for modfile in files:
            hunks_in_commit += len(modfile.hunks)
            for hunk in modfile.hunks:
                for line in hunk.lines:
                    # NOTE(review): assumes the old tuple-style pygit2 line
                    # API where line[0] is the origin char — confirm version
                    if line[0] == '-' or line[0] == '+':
                        lines_in_commit += 1

        # bucket by number of hunks
        if hunks_in_commit <= 1:
            hunk_xsmall += 1
        elif 2 <= hunks_in_commit <= 8:
            hunk_small += 1
        elif 9 <= hunks_in_commit <= 17:
            hunk_medium += 1
        elif 18 <= hunks_in_commit <= 26:
            hunk_large += 1
        elif hunks_in_commit >= 27:
            hunk_xlarge += 1

        # bucket by number of modified lines
        if lines_in_commit <= 5:
            line_xsmall += 1
        elif 6 <= lines_in_commit <= 46:
            line_small += 1
        elif 47 <= lines_in_commit <= 106:
            line_medium += 1
        elif 107 <= lines_in_commit <= 166:
            line_large += 1
        elif lines_in_commit >= 167:
            line_xlarge += 1

        i += 1
    print('')

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('-%Y-%m-%d.%H.%M.%S')
    name = repo.replace('/.git', '') + st + '.txt'

    # context manager guarantees the file is closed even on error
    with open(name, 'w') as output:
        output.write('--------- ' + repo + ' ----------' + '\n')
        output.write('Number of Lines Modified:' + '\n')
        output.write('x-small: ' + str(line_xsmall) + '\n')
        output.write('small: ' + str(line_small) + '\n')
        output.write('medium: ' + str(line_medium) + '\n')
        output.write('large: ' + str(line_large) + '\n')
        output.write('x-large: ' + str(line_xlarge) + '\n')

        output.write('Number of Files Modified:' + '\n')
        output.write('x-small: ' + str(file_xsmall) + '\n')
        output.write('small: ' + str(file_small) + '\n')
        output.write('medium: ' + str(file_medium) + '\n')
        output.write('large: ' + str(file_large) + '\n')
        output.write('x-large: ' + str(file_xlarge) + '\n')

        output.write('Number of Hunks Per Commit' + '\n')
        output.write('x-small: ' + str(hunk_xsmall) + '\n')
        output.write('small: ' + str(hunk_small) + '\n')
        output.write('medium: ' + str(hunk_medium) + '\n')
        output.write('large: ' + str(hunk_large) + '\n')
        output.write('x-large: ' + str(hunk_xlarge) + '\n')
Esempio n. 13
0
class GitMiner(BaseMiner):
    """Mine commits, tags, code changes and patches from a local git
    repository (via pygit2) into the database prepared by BaseMiner.
    """

    # git's well-known empty-tree hash; diffing a parentless commit
    # against it yields the commit's full content
    EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    Id_Name_Login = namedtuple("Id_Name_Login", ["id", "name", "login"])
    Code_Change = namedtuple("Code_Change", ["commit_id", "filename"])

    def __init__(self, args):
        super().__init__(args)

        self._initialise_db()

        if args.dbms == "sqlite":
            self._conn.execute("PRAGMA foreign_keys=ON")

        # lookup caches: email -> Id_Name_Login, oid <-> db primary key,
        # (commit_id, filename) -> code_change id
        self.email_map = {}
        self.commit_id = {}
        self.id_commit = {}
        self.code_change_map = {}

        self.__init_user_emails()

        self._dump_repository()

        self.aio = args.aio

        if self.aio:
            self._create_loop()

        self.repo = Repository(args.path)
        self._fetch_references()
        self._dump_tags()
        self._fetch_commit_ids()

    def _create_loop(self):
        """Create the dedicated asyncio event loop used in aio mode."""
        self.loop = asyncio.new_event_loop()

    def load_from_file(self, file):
        pass

    def dump_to_file(self, path):
        pass

    def __init_user_emails(self):
        """Cache contributor rows keyed by email in self.email_map."""
        res = self.execute_query(
            """
            SELECT email, id, login, name
            FROM contributors
            WHERE email IS NOT NULL 
            """
        ).fetchall()

        for row in res:
            # row layout is (email, id, login, name); the original code
            # passed row[2] (login) as name and row[3] (name) as login
            self.email_map[row[0]] = self.Id_Name_Login(id=row[1], name=row[3], login=row[2])

    def __init_code_change(self):
        """Cache code_change rows keyed by (commit_id, filename)."""
        res = self.execute_query(
            """
            SELECT id, commit_id, filename
            FROM code_change
            """
        ).fetchall()

        for row in res:
            self.code_change_map[self.Code_Change(commit_id=row[1], filename=row[2])] = row[0]

    def _dump_repository(self):
        """Insert the repository row if missing and remember its id."""
        logger.info("Dumping Repository...")

        # NOTE(review): repo_name/owner are interpolated directly into SQL;
        # acceptable only because they come from trusted configuration
        res = self.execute_query(
            f"""
            SELECT repo_id 
            FROM repository
            WHERE name="{self.repo_name}" and owner="{self.repo_owner}"
            """
        ).fetchone()

        if res:
            self._set_repo_id(res[0])
        else:
            repo = RepositoryStruct(
                name=self.repo_name,
                owner=self.repo_owner
            ).process()

            obj = self.db_schema.repository_object(
                name=self.repo_name,
                owner=self.repo_owner,
                created_at=repo.created_at,
                updated_at=repo.updated_at,
                description=repo.description,
                disk_usage=repo.disk_usage,
                fork_count=repo.fork_count,
                url=repo.url,
                homepage_url=repo.homepage_url,
                primary_language=repo.primary_language,
                total_stargazers=repo.stargazer_count,
                total_watchers=repo.watcher_count,
                forked_from=repo.forked_from
            )

            self._insert(self.db_schema.repository.insert(), obj)
            self._set_repo_id()

    def _fetch_references(self):
        """Split refs into tag names (self.tags) and branch tip oids
        (self.branches)."""
        self.tags, self.branches = [], {}
        for reference in self.repo.listall_references():
            if 'refs/tags' in reference:
                self.tags.append(reference)
            else:
                self.branches[reference] = self.repo.lookup_reference(reference).peel().oid

    def _dump_tags(self):
        """Insert one row per tag (annotated or lightweight)."""
        objects = []

        for tag in self.tags:
            ref = self.repo.lookup_reference(tag)
            tag_obj = self.repo[ref.target.hex]

            if isinstance(tag_obj, Tag):
                # annotated tag: carries its own name, message and tagger
                name = tag_obj.name
                msg = tag_obj.message
                tagged_object = tag_obj.hex
                tagger = self.__get_user_id(name=tag_obj.tagger.name, email=tag_obj.tagger.email, oid=tagged_object,
                                            is_author=False, is_tagger=True)
            else:
                # lightweight tag: points directly at a commit
                name = tag.split('/')[-1]
                msg = tag_obj.message
                tagged_object = tag_obj.hex
                tagger = self.__get_user_id(name=tag_obj.author.name, email=tag_obj.author.email, oid=tagged_object,
                                            is_author=True, is_tagger=False)

            obj = self.db_schema.tags_object(
                name=name,
                tagged_object=tagged_object,
                message=msg,
                tagger=tagger
            )

            objects.append(obj)

        self._insert(object_=self.db_schema.tags.insert(), param=objects)

    @staticmethod
    def __get_status(status):
        """Map a pygit2 GIT_DELTA_* code to a readable change type
        (None for unknown codes)."""
        return {
            1: 'ADDED',
            2: 'DELETED',
            3: 'MODIFIED',
            4: 'RENAMED',
            5: 'COPIED',
            6: 'IGNORED',
            7: 'UNTRACKED',
            8: 'TYPECHANGED',
        }.get(status)

    def __init_commits(self, inverse=False):
        """Cache oid->id (or id->oid when *inverse*) for this repo's
        commits."""
        if not inverse:
            res = self.execute_query(
                f"""
                SELECT oid, id
                FROM commits
                WHERE repo_id={self.repo_id}
                """
            ).fetchall()

            for row in res:
                self.commit_id[row[0]] = row[1]
        else:
            # use the same query wrapper as the forward mapping for
            # consistency (was a raw self._conn.execute call)
            res = self.execute_query(
                f"""
                SELECT id, oid
                FROM commits
                WHERE repo_id={self.repo_id}
                """
            ).fetchall()

            for row in res:
                self.id_commit[row[0]] = row[1]

    def __get_commit_id(self, oid, pk=None):
        """Resolve a commit's db id from its oid, or its oid from *pk*."""
        if not pk:
            try:
                return self.commit_id[oid]
            except KeyError:
                return None
        else:
            try:
                return self.id_commit[pk]
            except KeyError:
                # refresh the inverse cache once, then retry
                self.__init_commits(inverse=True)
                res = self.__get_commit_id(oid=None, pk=pk)
                if not res:
                    raise Exception(f"GitMiner => __get_commit_id: Pk {pk} does not exist!")
                else:
                    return res

    def __check_user_id(self, email):
        """Return [id, login, name] for *email*, consulting the cache
        first; None if the contributor is unknown."""
        try:
            map_ = self.email_map[email]
            return [map_.id, map_.login, map_.name]
        except KeyError:
            res = self.execute_query(
                f"""
                SELECT id, login, name
                FROM contributors
                WHERE email="{email}"
                """
            ).fetchone()

            if res:
                self.email_map[email] = self.Id_Name_Login(id=res[0], login=res[1], name=res[2])

            return res

    def __update_contributor(self, name, id_, login, email):
        """Persist a corrected contributor name and refresh the cache."""
        # escape embedded double quotes for the SQL literal
        name = name.replace('"', '""')

        self.execute_query(
            f"""
            UPDATE contributors
            SET name="{name}"
            WHERE id={id_}
            """
        )

        self.email_map[email] = self.Id_Name_Login(id=id_, login=login, name=name)

    def __get_user_id(self, name, email, oid, is_author, is_tagger):
        """Return the contributor id for *name*/*email*, creating the row
        (possibly anonymously) on first sight."""
        if not email:
            email = None

        if not name:
            name = None

        res = self.__check_user_id(email)

        if not res:
            user = CommitUserStruct(
                oid=oid,
                repo_name=self.repo_name,
                repo_owner=self.repo_owner,
                name=name,
                email=email,
                is_author=is_author,
                is_tagger=is_tagger
            ).process()

            if user is None:
                self._dump_anon_user_object(name=name, email=email, object_=self.db_schema.contributors.insert(),
                                            locked_insert=LOCKED)
            else:
                self._dump_user_object(login=None, user_object=user, object_=self.db_schema.contributors.insert(),
                                       locked_insert=LOCKED)

            # the row now exists; recurse once to pick up its id
            return self.__get_user_id(name=name, email=email, oid=oid, is_author=is_author, is_tagger=is_tagger)
        else:
            if name == res[2]:
                return res[0]
            elif name == res[1]:
                return res[0]
            else:
                self.__update_contributor(name=name, id_=res[0], login=res[1], email=email)
                return res[0]

    def _dump_code_change(self, oid):
        """Insert per-file change rows for the commit identified by *oid*."""
        commit = self.repo.get(oid)
        commit_id = self.__get_commit_id(oid)

        logger.debug(f"Dumping Code Change for commit_id -> {commit_id}...")

        code_change = []

        if commit:
            if not commit.parents:
                # parentless commit: diff against the empty tree
                diffs = [self.repo.diff(self.EMPTY_TREE_HASH, commit)]
            else:
                diffs = [self.repo.diff(i, commit) for i in commit.parents]

            total_diffs = len(diffs)
            for diff in diffs:
                logger.debug(f"Remaining: {total_diffs}")
                total_diffs -= 1
                for patch in diff:
                    obj = self.db_schema.code_change_object(
                        repo_id=self.repo_id,
                        commit_id=commit_id,
                        filename=patch.delta.new_file.path,
                        additions=patch.line_stats[1],
                        deletions=patch.line_stats[2],
                        changes=patch.line_stats[1] + patch.line_stats[2],
                        change_type=self.__get_status(patch.delta.status)
                    )

                    code_change.append(obj)

            self._insert(object_=self.db_schema.code_change.insert(), param=code_change)
            logger.debug(f"Successfully dumped code change for {oid}!")

    def __get_code_change_id(self, commit_id, filename):
        """Look up the code_change primary key for (commit_id, filename)."""
        try:
            return self.code_change_map[self.Code_Change(commit_id=commit_id, filename=filename)]
        except KeyError:
            # was `return Exception(...)`, which handed the caller an
            # exception object instead of signalling the failure
            raise Exception(f"GitMiner => __get_code_change_id: Object does not exist! commit_id={commit_id}, "
                            f"filename:(unknown)")

    def _dump_patches(self, oid):
        """Insert the raw patch text for every file change of commit *oid*."""
        commit = self.repo.get(oid)
        commit_id = self.__get_commit_id(oid)

        logger.debug(f"Dumping Patch for commit_id -> {commit_id}...")

        patches = []

        if not commit.parents:
            diffs = [self.repo.diff(self.EMPTY_TREE_HASH, commit)]
        else:
            diffs = [self.repo.diff(i, commit) for i in commit.parents]

        total_diffs = len(diffs)
        for diff in diffs:
            logger.debug(f"Remaining: {total_diffs}")
            total_diffs -= 1
            for patch in diff:
                obj = self.db_schema.patches_object(
                    code_change_id=self.__get_code_change_id(commit_id, patch.delta.new_file.path),
                    patch=patch.patch
                )

                patches.append(obj)

        self._insert(object_=self.db_schema.patches.insert(), param=patches)
        logger.debug(f"Successfully dumped patch for {oid}!")

    def _dump_commit(self, oid):
        """Insert the commit row (author/committer ids, stats, message)."""
        logger.debug(f"Inserting for commit: {oid}...")
        commit = self.repo.get(oid)

        if not commit.parents:
            diffs = [self.repo.diff(self.EMPTY_TREE_HASH, commit)]
        else:
            diffs = [self.repo.diff(i, commit) for i in commit.parents]

        num_files_changed = 0
        additions, deletions = 0, 0
        for diff in diffs:
            num_files_changed += diff.stats.files_changed
            additions += diff.stats.insertions
            deletions += diff.stats.deletions

        author_name = commit.author.name
        author_email = commit.author.email
        # NOTE(review): assumes author_email is always a string (possibly
        # empty), never None — confirm against pygit2's Signature
        author_id = self.__get_user_id(name=author_name, email=author_email, oid=oid.hex, is_author=True,
                                       is_tagger=False) if \
            author_email.strip() else None
        authored_date = datetime.fromtimestamp(commit.author.time)

        committer_name = commit.committer.name
        committer_email = commit.committer.email

        # NOTE(review): this literal looks like a scrubbed placeholder
        # email; verify against the original data source
        if committer_email == "*****@*****.**":
            committer_id = author_id
        else:
            committer_id = self.__get_user_id(name=committer_name, email=committer_email, oid=oid.hex,
                                              is_author=False, is_tagger=False) if committer_email.strip() else None

        committed_date = datetime.fromtimestamp(commit.commit_time)

        message = commit.message

        # more than one parent means a merge commit
        is_merge = 1 if len(commit.parents) > 1 else 0

        obj = self.db_schema.commits_object(
            repo_id=self.repo_id,
            oid=oid.hex,
            additions=additions,
            deletions=deletions,
            author_id=author_id,
            authored_date=authored_date,
            committer_id=committer_id,
            committer_date=committed_date,
            message=message,
            num_files_changed=num_files_changed,
            is_merge=is_merge
        )

        self._insert(object_=self.db_schema.commits.insert(), param=obj)
        logger.debug(f"Successfully dumped commit: {oid.hex}")

    def __fetch_branch_commits(self, branch_target):
        """Walk one branch tip, adding unseen commit oids to self.commits."""
        logger.info(f"Ongoing Branch {branch_target[0]}...")

        for commit in self.repo.walk(branch_target[1], GIT_SORT_TOPOLOGICAL | GIT_SORT_TIME):
            if commit.oid not in self.commits:
                self.commits.add(commit.oid)
            else:
                # everything below this commit was already collected
                # through another branch
                break

    def _fetch_commit_ids(self):
        """Collect all commit oids, using the on-disk cache when present."""
        try:
            with open(f"{ROOT}/.gras-cache/{self.repo_name}_commits.txt", "rb") as fp:
                self.commits = pickle.load(fp)

            self.commits = [Oid(hex=x) for x in self.commits]

            logger.info(f"TOTAL COMMITS: {len(self.commits)}")
            return self.commits
        except FileNotFoundError:
            logger.error("Commits file not present, dumping...")

        self.commits = set()
        with concurrent.futures.ThreadPoolExecutor(max_workers=THREADS) as executor:
            process = {executor.submit(self.__fetch_branch_commits, branch_target): branch_target for branch_target
                       in self.branches.items()}

            for future in concurrent.futures.as_completed(process):
                branch_target = process[future]
                logger.info(f"Fetched for {branch_target[0]}, Total: {len(self.commits)}")

        logger.info(f"TOTAL COMMITS: {len(self.commits)}")
        with open(f"{ROOT}/.gras-cache/{self.repo_name}_commits.txt", "wb") as fp:
            temp = [x.hex for x in self.commits]
            pickle.dump(temp, fp)
            del temp

    @timing(name="commits", is_stage=True)
    def _parse_commits(self):
        """Dump all not-yet-stored commits, THREADS processes at a time."""
        res = self.execute_query(
            f"""
            SELECT DISTINCT oid
            FROM commits
            """
        ).fetchall()

        dumped_commits = [x[0] for x in res]
        del res

        commits = list(self.commits)
        for i in range(0, len(commits), THREADS):
            proc = [mp.Process(target=self._dump_commit, args=(oid,)) for oid in commits[i:i + THREADS] if
                    oid.hex not in dumped_commits]
            for p in proc:
                p.start()

            # join instead of busy-waiting on is_alive()
            for p in proc:
                p.join()

    @timing(name="code change", is_stage=True)
    def _parse_code_change(self):
        """Dump code changes for commits that have none yet."""
        id_oid = self.execute_query(
            f"""
            SELECT id, oid
            FROM commits
            """
        ).fetchall()

        dumped_ids = self.execute_query(
            f"""
            SELECT DISTINCT commit_id
            FROM code_change       
            """
        ).fetchall()

        dumped_ids = [x[0] for x in dumped_ids]

        not_dumped_commits = [x[1] for x in id_oid if x[0] not in dumped_ids]
        del dumped_ids
        del id_oid

        for i in range(0, len(not_dumped_commits), THREADS):
            proc = [mp.Process(target=self._dump_code_change, args=(oid,)) for oid in
                    not_dumped_commits[i: i + THREADS]]
            for p in proc:
                p.start()

            # join instead of busy-waiting on is_alive()
            for p in proc:
                p.join()

    @timing(name="patches", is_stage=True)
    def _parse_patches(self):
        """Dump raw patches for code changes not yet in the patches table."""
        self.__init_commits(inverse=True)

        res = self.execute_query(
            f"""
            SELECT id, commit_id
            FROM code_change
            """
        ).fetchall()

        cc_commit = {}
        for row in res:
            cc_commit[row[0]] = row[1]

        res = self.execute_query(
            """
            SELECT code_change_id
            FROM patches
            """
        )

        not_dumped_commits = set(cc_commit.values()).difference({cc_commit[x[0]] for x in res})
        not_dumped_commits = sorted([self.id_commit[id_] for id_ in not_dumped_commits])

        del cc_commit

        for i in range(0, len(not_dumped_commits), THREADS):
            # was mistakenly spawning _dump_code_change here, so patches
            # were never actually dumped
            proc = [mp.Process(target=self._dump_patches, args=(oid,)) for oid in
                    not_dumped_commits[i: i + THREADS]]
            for p in proc:
                p.start()

            # join instead of busy-waiting on is_alive()
            for p in proc:
                p.join()

    @timing(name="async -> commits", is_stage=True)
    async def _async_parse_commits(self):
        """Async variant: dump every commit via the shared executor."""
        loop = asyncio.get_event_loop()
        tasks = [loop.run_in_executor(self.executor, self._dump_commit, oid) for oid in self.commits]
        completed, _ = await asyncio.wait(tasks)
        for t in completed:
            logger.info(f"Dumped commit: {t.result()}")

    @timing(name="async -> code change", is_stage=True)
    async def _async_parse_code_change(self):
        """Async variant: dump every code change via the shared executor."""
        loop = asyncio.get_event_loop()
        tasks = [loop.run_in_executor(self.executor, self._dump_code_change, oid) for oid in self.commits]
        completed, _ = await asyncio.wait(tasks)
        for t in completed:
            logger.info(f"Dumped Code Change for commit: {t.result()}")

    def process(self):
        """Entry point: parse commits and code changes (async or sync)."""
        if self.aio:
            # run the coroutine variants; the synchronous _parse_* methods
            # are not coroutines and cannot be passed to run_until_complete
            self.loop.run_until_complete(self._async_parse_commits())
            self.loop.run_until_complete(self._async_parse_code_change())
        else:
            # self._parse_commits()
            self.__init_commits()
            self._parse_code_change()

        # self._parse_patches()

    def __del__(self):
        # getattr guard: __init__ may have failed before self.aio was set,
        # and __del__ must never raise
        if getattr(self, "aio", False):
            self.loop.close()
Esempio n. 14
0
#    print(commit.commit_time)
#    print(commit.commit_time_offset)

git = sh.git.bake(_cwd='/home/heather/research/spoon-knife')

# check out each revision in turn (e.g. to let external tools inspect it)
for point in history:
    git.checkout(point)
#    print(subprocess.check_output(['ohcount', 'spoon-knife']))

git.checkout(history[0])

# compare each consecutive pair of revisions; the original while-loop never
# incremented its counter and so looped forever on the first pair
for i in range(len(history) - 2):
    t0 = base.revparse_single(history[i])
    t1 = base.revparse_single(history[i + 1])
    diff = base.diff(t0, t1)
    for patch in diff:
        print('NUM HUNKS: ' + str(len(patch.hunks)))
        for hunk in patch.hunks:
            totesLines = 0
            totesMods = 0
            for line in hunk.lines:
                totesLines += 1
                # NOTE(review): assumes the old tuple-style pygit2 line API
                # where line[0] is the origin character — confirm version
                if line[0] == '-' or line[0] == '+':
                    totesMods += 1
                    print(line)
            print('TOTAL LINES: ' + str(totesLines))
            print('TOTAL MODS: ' + str(totesMods))
Esempio n. 15
0
class GitInfo:
    '''Snapshot of a git repository's state: worktree diff vs HEAD, status
    flags, and the current HEAD commit.

    Call .current_oid_hex to save commit-oid as str.
    '''
    def __init__(self, base_path=None):
        self.base_path = None
        self.repo = None
        self.dirty = None            # list of (path, flags) for modified files
        self.ignored = None          # list of ignored paths
        self.status = None           # raw pygit2 status mapping
        self.diff = None             # list of patches vs HEAD
        self.head_name = None
        self.current_commit = None
        self.current_oid_hex = None

        if base_path is not None:
            self.open(base_path)

    def open(self, base_path):
        """Attach to the repository containing *base_path*.

        Raises if the repository has no commit (no resolvable HEAD).
        """
        self.base_path = base_path
        repo_path = discover_repository(self.base_path)
        self.repo = Repository(repo_path)

        try:
            self.repo.head.target
        except Exception as e:
            # chain the original error so the root cause is not lost
            raise Exception('repo.head not found! No commit in repo.') from e

    def collect(self):
        '''Compare worktree to `HEAD`.
            Show difference, and HEAD commit info.
            No branches check.
        '''
        if self.repo is None:
            repo_path = discover_repository(self.base_path)
            self.repo = Repository(repo_path)

        # diff: list of patches (patch.text is the textual diff)
        self.diff = list(self.repo.diff('HEAD'))

        # status: split paths into ignored vs genuinely dirty
        self.status = self.repo.status()
        ss = self.status
        stt_p = []
        stt_p_ig = []
        stt_f = []
        for filepath, flag in ss.items():
            flag = statusFlag(flag)
            if 'GIT_STATUS_IGNORED' in flag:
                stt_p_ig.append(filepath)
            else:
                stt_p.append(filepath)
                stt_f.append(flag)

        self.dirty = [(p, f) for p, f in zip(stt_p, stt_f)]
        self.ignored = stt_p_ig

        # branch / ref name
        self.head_name = self.repo.head.name

        # commit attrs:
        #     author,commiter: Signature:  .name .email
        #     commit_time # Unixtimestamp
        self.current_commit = self.repo.head.peel()
        self.current_oid_hex = self.current_commit.oid.hex

    def report(self):
        """Collect fresh state and return it as a plain dict."""
        self.collect()
        data = {}

        #diff = [patch.text for patch in self.diff]
        #data['diff']= diff

        status = {'dirty': self.dirty, 'ignored': self.ignored}

        data['status'] = status

        ref = self.head_name
        data['head'] = ref

        c = self.current_commit
        t = c.commit_time
        utc_str = datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')

        commit = {
            'message': c.message,
            'oid': c.oid,
            'time': utc_str,
            'author': (c.author.name, c.author.email),
            # was c.author.email — copy-paste bug in the committer tuple
            'committer': (c.committer.name, c.committer.email),
        }

        data['commit'] = commit

        return data

    def commit(self, msg):
        """Commit everything in the worktree with message *msg*."""
        return commitAll(self.repo, msg, is_first=False)

    def __str__(self):
        return str(self.report())
Esempio n. 16
0
class PygitHelper:
    """Small convenience wrapper around a pygit2 ``Repository``.

    The repository is opened from the current working directory at
    construction time.
    """

    def __init__(self):
        """Initialize the repository object from the current working directory."""
        self.repo = Repository(os.getcwd())

    def get_diff(self, id_a, id_b):
        """Yield changed-line mappings between two object ids.

        Yields:
            (new_lineno, old_lineno) pairs for each changed line.

        NOTE(review): assumes the object returned by ``.diff`` exposes
        ``hunks`` directly -- verify against the installed pygit2 version.
        """
        patch = self.repo[id_b].diff(self.repo[id_a])
        for hunk in patch.hunks:
            for line in hunk.lines:
                yield line.new_lineno, line.old_lineno

    def _deltas(self, rev_a, rev_b):
        # Shared helper: diff HEAD~rev_a against HEAD~rev_b, or against the
        # index (cached=True) when rev_b is falsy (None or 0).  Previously
        # this logic was duplicated in get_files and get_files_tuple.
        if not rev_b:
            return self.repo.diff("HEAD~" + str(rev_a), cached=True).deltas
        return self.repo.diff("HEAD~" + str(rev_a),
                              "HEAD~" + str(rev_b)).deltas

    def get_files(self, rev_a=0, rev_b=None):
        """Get all files changed between two revisions.

        Args:
            rev_a: Index (ancestry offset from HEAD) to start from.
            rev_b: Index to go to; falsy means "diff against the index".

        Returns:
            new_files: Files from the new commit specified above.
            old_files: Files from the old commit specified above.
        """
        o_files = []
        n_files = []
        for delta in self._deltas(rev_a, rev_b):
            n_files.append(delta.new_file)
            o_files.append(delta.old_file)

        return n_files, o_files

    def get_files_tuple(self, rev_a=0, rev_b=None):
        """Get all changed files as (new_file, old_file) tuples.

        Args:
            rev_a: Index to start from.
            rev_b: Index to go to.

        Returns:
            files: List of tuples containing new and old files.
        """
        # Reuse get_files instead of duplicating the diff logic.
        n_files, o_files = self.get_files(rev_a, rev_b)
        return list(zip(n_files, o_files))

    def get_remote(self):
        """Get configured remotes.

        Returns:
            remotes: All configured remotes as a dict mapping name -> url.
        """
        return {rep.name: rep.url for rep in self.repo.remotes}

    def get_branches(self):
        """Get all branches of the current repository.

        Returns:
            branches_local: All local branches.
            branches_remote: All remote branches.
        """
        branches_local = list(self.repo.branches.local)
        branches_remote = list(self.repo.branches.remote)

        return branches_local, branches_remote

    def get_current_branch(self):
        """Get the branch currently checked out.

        Returns:
            current_branch: Short name of the current branch.
        """
        # head.name is a full ref like 'refs/heads/master'; keep the last part.
        return (self.repo.head.name).rsplit("/", 1)[1]

    def get_commit_history(self, length=10):
        """Get the commit history, newest first.

        Args:
            length: Maximum number of commits to return.

        Returns:
            commits: List of pygit2.Commit objects.
        """
        commits = []

        for commit in self.repo.walk(self.repo.head.target, GIT_SORT_TIME):
            if length == 0:
                return commits

            commits.append(commit)
            length -= 1

        return commits
Esempio n. 17
0
class prototype:
    """Clone a git repository and compute size metrics over its history.

    Metrics cover lines changed, hunks and files per commit; results are
    written to '<name>-results.txt' by printStats().

    NOTE(review): this is Python 2 code (print statements) and it indexes
    pygit2 hunk lines as plain strings (line[0]); confirm the target pygit2
    version before reuse.
    """
    repo = ""  # Path to a given repository
    name = ""  # Name of a repository
    base = ""  # Repository as defined in pygit2

    # Initialization. Clones the given repository, placing it in the current directory,
    # and changes to the repository directory.
    def init(self, repository):
        """Clone *repository* (a URL ending in .git) and open it with pygit2."""
        self.repo = repository

        # Use regular expressions to match the last instance of a forward slash
        # followed by the name of the repository, which we wish to extract, followed
        # by ".git". 
        m = re.search('/([^/]+).git$', repository)
        if m:
            self.name = m.group(1)

        # Only clone when a local checkout is not already present.
        # NOTE(review): shell command built by string concatenation -- unsafe
        # for untrusted URLs.
        if not os.path.isdir(self.name):
            os.system('git clone ' + self.repo) # Get the repository from GitHub

        self.base = Repository(self.name)
        self.base.checkout('HEAD')

    # Destruction. Remove the given repository from memory.
    def destroy(self):
        """Delete the local clone from disk."""
        # NOTE(review): each os.system call runs in its own subshell, so this
        # 'cd' has no effect on the following 'rm -rf'; the rm removes
        # self.name relative to the current directory.
        os.system('cd ' + self.name)
        os.system('rm -rf ' + self.name)

    # Get total LOC by given repository. 
    def totalRepoLOC(self):
        """Return total lines of code in the checkout (via countDirLOC)."""
        loc = countDirLOC(self.name)
        return loc

    # Get total commits by a given repository
    def totalRepoCommits(self):
        """Return the number of commits reachable from HEAD."""
        commits = 0
        for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL):
            commits = commits + 1
        return commits

    # Get a list of LOC changed per commit
    def locPerCommit(self):
        """Return insertions+deletions per consecutive commit pair.

        Shells out to 'git diff --shortstat' between each pair of SHAs.
        """
        loc = []
        oldPath = os.popen('pwd')  # NOTE(review): captured but never used
        os.chdir(self.name)
        sha1 = 0
        sha2 = 0

        start = 1
        total = self.totalRepoCommits()

        # For each commit within the repository
        for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL):

            # Python 2 progress indicator (rewrites the same terminal line).
            print '\r', start, '/', total,
            start += 1

            # Based on the SHA, use git to show the patch for that commit
            sha1 = sha2
            sha2 = commit.hex
            if sha1 != 0:
                p = os.popen('git diff --shortstat ' + sha1 + ' ' + sha2)
                line = p.readline()

                # line contains "# file changed, # insertions(+), # deletions(-)
                # Use regular expressions to find the number of additions and deletions.
                # Additions are found after ", " and before " insertion". Deletions are
                # found after "(+), " and before " deletion".
                m = re.search(', (.*) insertion', line)
                additions = 0
                deletions = 0
                if m:
                    additions = m.group(1)
                m = re.search('\(\+\), (.*) deletion', line)
                if m:
                    deletions = m.group(1)

                # Get the total and append to array
                modifications = int(additions) + int(deletions)
                loc.append(modifications)

        os.chdir('..')
        return loc


    # Get a list containing the total number of line additions and deletions (including
    # whitespace and comments) contained within each hunk that was changed over t
    def locPerHunk(self):
        """Return the number of modified lines for every hunk in history."""
        loc = []
        history = []

        # Get the hex number for each commit within the repository
        for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL):
            sha = commit.hex
            history.append(sha)

        # Compare each revision in the history of the repository with the previous rev.
        i = 0
        while i < len(history) - 1:
            t0 = self.base.revparse_single(history[i])
            t1 = self.base.revparse_single(history[i+1])
            diff = self.base.diff(t0,t1)
            patches = [p for p in diff]
            for patch in patches:
                for hunk in patch.hunks:

                    # Check the first character in each hunk line. Only those that have
                    # been modified will contain a '+' (insertion) or '-' (deletion)
                    totalModifications = 0
                    for line in hunk.lines:
                        if line[0] == '-' or line[0] == '+':
                            totalModifications +=1
                    loc.append(totalModifications)
            i += 1
        return loc

    # Get the total number of lines contained within a hunk, including additions, deletions,
    # and surrounding non-changed lines
    def locInHunk(self):
        """Return the total line count (context included) for every hunk."""
        loc = []
        history = []

        # Get the hex number for each commit within the repository
        for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL):
            sha = commit.hex
            history.append(sha)

        # Compare each revision in the history of the repository with the previous rev.
        i = 0
        while i < len(history) - 1:
            t0 = self.base.revparse_single(history[i])
            t1 = self.base.revparse_single(history[i+1])
            diff = self.base.diff(t0,t1)
            patches = [p for p in diff]
            for patch in patches:
                for hunk in patch.hunks:
                    totalLines = 0
                    for line in hunk.lines:
                       totalLines += 1
                    loc.append(totalLines)
            i += 1
        return loc

    # Perform a diff between all commits starting from oldest to newest
    #  and compile temp files comprised of only modified lines.
    #  Run cloc on temp files to get sloc for each diff set.
    def slocPerDiff(self):
        """Return a list of SLOC counts (via cloc) for each diff in history."""
        # Storage for commit history hashes
        history = []

        # Store all slocs
        slocPerDiffs = []

        # Move through the system history from newest to oldest commit
        for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE):
            history.append(commit)

        i = 0
        while i < len(history) - 2:
            sloc = 0
            t0 = self.base.revparse_single(history[i].hex)
            t1 = self.base.revparse_single(history[i+1].hex)
            try:
                diff = self.base.diff(t0,t1)
            except ValueError:
                print "Caught value error."
                i += 1
                continue

            patches = [p for p in diff]
            for patch in patches:
                print patch.new_file_path
                # Write only the modified line contents to a temp file.
                # NOTE(review): 'tmp' is written here, but cloc below runs on
                # patch.new_file_path, not on 'tmp' -- confirm which was
                # intended.
                hunkfile = open("tmp", 'w') 
                for hunk in patch.hunks:
                    totesLines = 0
                    totesMods = 0
                    for line in hunk.lines:
                        totesLines += 1
                        if line[0] == '-' or line[0] == '+':
                            totesMods += 1
                            hunkfile.write(line[1])
                hunkfile.close()

                # Parse cloc's --csv output; SLOC is the 5th column of each
                # file row after the header line starting with 'language'.
                output = subprocess.Popen('cloc ' + patch.new_file_path + ' --by-file --csv', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                start = False
                for line in output.stdout.readlines():
                    if line[0] == 'l':
                        start = True
                        continue
                    if start:
                        temp = line.split(',')
                        sloc += int(temp[4].replace('\n', ''))
                        retval = output.wait()
                os.remove("tmp")                        
            i += 1
            slocPerDiffs.append(int(sloc))

        return slocPerDiffs

    # Get a list containing the number of hunks changed per commit
    def hunksPerCommit(self):
        """Return the number of hunks for each consecutive commit pair."""
        hunks = []
        history = []

        start = 1
        total = self.totalRepoCommits()

        # Get the hex number for each commit within the repository
        for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL):
            sha = commit.hex
            history.append(sha)

        # Compare each revision in the history of the repository with the previous rev.
        i = 0
        while i < len(history) - 1:
            print '\r', start, '/', total,
            start += 1

            t0 = self.base.revparse_single(history[i])
            t1 = self.base.revparse_single(history[i+1])

            try:
                diff = self.base.diff(t0,t1)
            except ValueError:
                print "Caught value error."
                i += 1
                continue

            patches = [p for p in diff]
            for patch in patches:
                hunks.append(len(patch.hunks))
            i += 1

        return hunks


    # Get a list of the number of files changed per commit
    def filesPerCommit(self):
        """Return the number of files touched by each commit pair."""
        files = []
        oldPath = os.popen('pwd')  # NOTE(review): captured but never used
        os.chdir(self.name)
        sha1 = 0
        sha2 = 0

        start = 1
        total = self.totalRepoCommits()

        # For each commit within the repository
        for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL):

            print '\r', start, '/', total,
            start += 1

            # Based on the SHA, use git to show the patch for that commit
            sha1 = sha2
            sha2 = commit.hex
            if sha1 != 0:
                p = os.popen('git diff --shortstat ' + sha1 + ' ' + sha2)
                line = p.readline()

                # line contains "# file changed, # insertions(+), # deletions(-)
                # Use regular expressions to find the number of files modified, which
                # are contained first on the line followed by " file"
                m = re.search(' (.*) file', line)
                if m:
                    numFilesChanged = int(m.group(1))
                    files.append(numFilesChanged)

        os.chdir('..')
        return files

    # Print out all stats for the repository
    def printStats(self):
        """Bucket each metric into x-small..x-large bins and write the report file."""
        f = open(self.name + '-results.txt', 'w')
        f.write(("-----------" + self.name + "-----------\n"))

        # Stats on entire repository
        repoLOC = self.totalRepoLOC()
        repoCommits = self.totalRepoCommits()

        # Lists by commit
        locPerCommit   = self.locPerCommit()
        #slocPerDiff    = self.slocPerDiff()
        hunksPerCommit = self.hunksPerCommit()
        filesPerCommit = self.filesPerCommit()
        
        # Stats for LOC
        xsmall = 0
        small  = 0
        medium = 0
        large  = 0
        xlarge = 0
        for item in locPerCommit:
            if (item >= 0 and item <= 5):
                xsmall += 1
            if (item >= 6 and item <= 46):
                small += 1
            if (item >= 47 and item <= 106):
                medium += 1
            if (item >= 107 and item <= 166):
                large += 1
            if (item >= 167):
                xlarge += 1

        f.write("Number of Modified Lines:\n")
        f.write("x-small: " + str(xsmall) + "\n")
        f.write("small:   " + str(small) + "\n")
        f.write("medium:  " + str(medium) + "\n")
        f.write("large:   " + str(large) + "\n")
        f.write("x-large: " + str(xlarge) + "\n")
        

        '''
        # Stats for SLOC
        xsmall = 0
        small  = 0
        medium = 0
        large  = 0
        xlarge = 0
        for item in slocPerDiff:
            if (item >= 0 and item <= 5):
                xsmall += 1
            if (item >= 6 and item <= 46):
                small += 1
            if (item >= 47 and item <= 106):
                medium += 1
            if (item >= 107 and item <= 166):
                large += 1
            if (item >= 167):
                xlarge += 1

        f.write("Number of Modified SLOC: \n")
        f.write("x-small: " + str(xsmall) + "\n")
        f.write("small:   " + str(small) + "\n")
        f.write("medium:  " + str(medium) + "\n")
        f.write("large:   " + str(large) + "\n")
        f.write("x-large: " + str(xlarge) + "\n")

        '''
        # Print stats for modified files
        xsmall = 0
        small  = 0
        medium = 0
        large  = 0
        xlarge = 0
        for item in filesPerCommit:
            if (item == 1):
                xsmall += 1
            if (item >= 2 and item <= 4):
                small += 1
            if (item >= 5 and item <= 7):
                medium += 1
            if (item >= 8 and item <= 10):
                large += 1
            if (item >= 11):
                xlarge += 1

        f.write("Number of modified files: \n")
        f.write("x-small: " + str(xsmall) + "\n")
        f.write("small:   " + str(small) + "\n")
        f.write("medium:  " + str(medium) + "\n")
        f.write("large:   " + str(large) + "\n")
        f.write("x-large: " + str(xlarge) + "\n")

        # Prints stats for hunks
        xsmall = 0
        small  = 0
        medium = 0
        large  = 0
        xlarge = 0
        for item in hunksPerCommit:
            if (item >= 0 and item <= 1):
                xsmall += 1
            if (item >= 2 and item <= 8):
                small += 1
            if (item >= 9 and item <= 17):
                medium += 1
            if (item >= 18 and item <= 26):
                large += 1
            if (item >= 27):
                xlarge += 1

        f.write("Number of hunks per commit: \n")
        f.write("x-small: " + str(xsmall) + "\n")
        f.write("small:   " + str(small) + "\n")
        f.write("medium:  " + str(medium) + "\n")
        f.write("large:   " + str(large) + "\n")
        f.write("x-large: " + str(xlarge) + "\n")

        f.close()
Esempio n. 18
0
def get_and_update_repo_cache(repo_path, repo_name):
    """Walk the repo history and accumulate per-author / per-month statistics.

    Results are cached in '<repo_name>-stats.cache'; only commits newer than
    the cached 'latest_sha' are processed on subsequent runs.  Commits that
    look like imported third-party code are skipped and their hashes are
    written to '<repo_name>-ignored-commits.txt'.

    NOTE(review): Python 2 code.  `load`/`dump`, `defaultdict_int`,
    `author_aliases`, `whitelist_commits` and `blacklist_commits` are defined
    elsewhere in the file (presumably pickle helpers and configuration) --
    verify at the import site.
    """
    cache_filename = '%s-stats.cache' % repo_name
    if os.path.exists(cache_filename):
        with open(cache_filename) as f:
            data = load(f)
    else:
        data = {
            'author_to_month_to_additions': defaultdict(defaultdict_int),
            'author_to_month_to_deletions': defaultdict(defaultdict_int),
            'author_to_month_to_changes': defaultdict(defaultdict_int),
            'author_to_month_to_commits': defaultdict(defaultdict_int),
            'day_to_count': defaultdict(defaultdict_int),
            'change_count_by_file': defaultdict(int),
            'latest_sha': None,
        }

    repo = Repository(repo_path)

    ignored_commits = []

    count = 0
    for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL):
        count += 1
        if commit.type == GIT_OBJ_COMMIT:
            # Stop as soon as we reach the last commit seen by a previous run.
            if data['latest_sha'] == commit.hex:
                break

            try:
                # Diff against the first parent; root commits have no parent
                # and raise KeyError.
                d = repo.diff('%s^' % commit.hex, commit)
            except KeyError:
                print "Commits without parent: ", commit.hex
                continue
            additions = d.stats.insertions
            deletions = d.stats.deletions

            author = author_aliases.get(commit.author.email, commit.author.email)

            day = date.fromtimestamp(commit.commit_time)
            data['day_to_count']['Lines'][day] += additions
            data['day_to_count']['Lines'][day] -= deletions

            # Heuristic: very large, lopsided commits are treated as vendored
            # libraries rather than authored code and are skipped unless
            # whitelisted.
            if additions > 1000 and deletions < 5 and commit.hex not in whitelist_commits:
                if commit.hex not in blacklist_commits:
                    ignored_commits.append(commit.hex)
                    # print 'WARNING: ignored %s looks like an embedding of a lib (message: %s)' % (commit.hex, commit.message)
                continue
            if (additions > 3000 or deletions > 3000) and commit.hex not in whitelist_commits:
                if commit.hex not in blacklist_commits:
                    ignored_commits.append(commit.hex)
                    # print 'WARNING: ignored %s because it is bigger than 3k lines. Put this commit in the whitelist or the blacklist (message: %s)' % (commit.hex, commit.message)
                continue
            month = date(day.year, day.month, 1)
            data['author_to_month_to_additions'][author][month] += additions
            data['author_to_month_to_deletions'][author][month] += deletions
            data['author_to_month_to_changes'][author][month] += additions + deletions
            data['author_to_month_to_commits'][author][month] += 1
            # The first processed commit becomes the new resume marker
            # (the break above implies the walk visits it first).
            if data['latest_sha'] is None:
                data['latest_sha'] = commit.hex

            if d.patch:
                # Count touched files from '+++ b/<path>' headers.
                # NOTE(review): the slice skips len('+++ ') + 1 characters,
                # which leaves a leading '/' from '+++ b/<path>' -- confirm
                # whether the extra +1 was meant to drop the 'b/' prefix.
                for changed_path in [x for x in d.patch.split('\n') if x.startswith('+++ ') and '/dev/null' not in x]:
                    data['change_count_by_file'][changed_path[len('+++ ') + 1:]] += 1

    with open(cache_filename, 'w') as f:
        dump(data, f)

    with open(repo_name + '-ignored-commits.txt', 'w') as f:
        f.writelines('%s\n' % x for x in ignored_commits)

    return data
Esempio n. 19
0
class GitRepo(object):

    ''' git repo class

    Thin wrapper around a pygit2 Repository that returns plain dicts for
    commits, trees, blobs and tags.  All methods print errors and return
    None/False instead of raising.
    '''

    def __init__(self, path):
        """Open the repository at *path*; on failure store None and print the error."""
        try:
            self.__repo = Repository(path)
        except Exception as e:
            self.__repo = None
            print(e)

    def get_info(self):
        """Return basic repository info and the default signature, or None."""
        if not self.__repo:
            return None
        signature = self.__repo.default_signature
        result = {
            'path': self.__repo.path,
            'workdir': self.__repo.workdir,
            'bare': self.__repo.is_bare,
            'empty': self.__repo.is_empty,
            'name': signature.name,
            'email': signature.email,
            'time': signature.time,
            'offset': signature.offset,
        }
        return result

    def get_all_references(self):
        """Return the names of all references in the repository."""
        return self.__repo.listall_references()

    def get_reference(self, name):
        """Look up a reference by full name (e.g. 'refs/heads/master'); None if absent."""
        if not self.__repo:
            return None
        ref = None
        try:
            ref = self.__repo.lookup_reference(name)
        except Exception as e:
            print(e)
        return ref

    def get_all_branches(self, branch_type=None):
        """List branch names; *branch_type* narrows to local or remote only."""
        if not self.__repo:
            return None
        if branch_type:
            return self.__repo.listall_branches(branch_type)
        r = self.__repo.listall_branches(GIT_BRANCH_LOCAL | GIT_BRANCH_REMOTE)
        return r

    def get_branch(self, name, branch_type=GIT_BRANCH_LOCAL):
        """Look up a branch object by name."""
        if not self.__repo:
            return None
        return self.__repo.lookup_branch(name, branch_type)

    def check_branch(self, name, branch_type=None):
        """Return the branch if *name* exists, else False.

        A '/' in the name is taken to mean a remote branch.
        """
        if not branch_type:
            if '/' in name:
                branch_type = GIT_BRANCH_REMOTE
            else:
                branch_type = GIT_BRANCH_LOCAL
        try:
            result = self.get_branch(name, branch_type)
            return result
        except Exception as e:
            print(e)
            return False

    def get_current_commit(self):
        """Return the HEAD commit as a dict, or None."""
        if not self.__repo:
            return None
        commit = self.__repo.revparse_single('HEAD')
        return self.get_commit(commit)

    def get_commit_by_branch(self, branch):
        """Return the tip commit of *branch* as a dict, or None on error."""
        if not self.__repo:
            return None
        # Build the full ref name; remote branches live under refs/remotes/.
        query = 'refs/'
        if hasattr(branch, 'remote_name'):
            query += 'remotes/'
        else:
            query += 'heads/'
        query += branch.branch_name
        try:
            ref = self.get_reference(query)
            commit = ref.target
            return self.get_commit(commit)
        except Exception as e:
            print(e)
            return None

    def get_commit_by_tag(self, tag):
        """Return the commit a tag object points at, as a dict."""
        if self.__repo is None:
            return None
        if tag:
            commit = tag.get_object()
            return self.get_commit(commit)
        return None

    def get_commit(self, oid_or_commit):
        ''' Return a commit as a json-ready dict, or None on error.

        *oid_or_commit* may be a pygit2 Commit or an oid to resolve.
        '''
        if not self.__repo or not oid_or_commit:
            return None
        try:
            commit = oid_or_commit
            if not isinstance(oid_or_commit, Commit):
                commit = self.__repo.get(oid_or_commit)
            if commit and commit.type == GIT_OBJ_COMMIT:
                result = {
                    'id': str(commit.id),
                    'author': commit.author.name,
                    # NOTE: key spelling 'commiter' kept for backward
                    # compatibility with existing consumers.
                    'commiter': commit.committer.name,
                    'message': commit.message,
                    'message_encoding': commit.message_encoding,
                    'tree': str(commit.tree_id),
                    'parent': [str(pid) for pid in commit.parent_ids],
                    'time': str(commit.commit_time),
                    'time_offset': str(commit.commit_time_offset),
                }
                return result
        except Exception as e:
            print(e)
        return None

    def get_commits(self, depth=10, oid_or_commit=None):
        """Return up to *depth* commit dicts starting from *oid_or_commit*
        (HEAD when omitted), recursing through all parents."""
        result = []
        if depth == 0:
            return result
        if oid_or_commit:
            commit = self.get_commit(oid_or_commit)
        else:
            commit = self.get_current_commit()
        if not commit:
            return result
        # TODO: starting from a commit or its parent
        # TODO: author
        result.append(commit)
        depth -= 1
        if commit and commit['parent']:
            for parent in commit['parent']:
                result.extend(self.get_commits(depth, parent))
        return result

    def get_commits_by_branch(self, name, path=None):
        """Return commits reachable from local branch *name* as a list of
        commit dicts, optionally restricted to commits touching *path*.

        Returns None when the repository or branch is unavailable.
        """
        if not self.__repo:
            return None
        if self.check_branch(name):
            ref = self.get_reference('refs/heads/' + name)
            if ref:
                # BUGFIX: the commit id must be passed as oid_or_commit (the
                # old code passed it as the *depth* argument), and
                # get_commits returns a list, not a mapping.
                commits = self.get_commits(oid_or_commit=ref.target)
                return [c for c in commits
                        if self.check_commit_by_path(c, path)]
        return None

    def check_tag(self, name):
        """Return the tag reference if it exists, else False."""
        try:
            ref = self.get_reference('refs/tags/' + name)
            return ref
        except Exception:
            return False

    def get_commits_by_tag(self, tag, path=None):
        """Return commits reachable from *tag* as a list of commit dicts,
        optionally restricted to commits touching *path*."""
        if not self.__repo:
            return None
        if tag:
            # BUGFIX: pass the target id as oid_or_commit and filter the
            # returned list (get_commits does not return a mapping).
            commits = self.get_commits(oid_or_commit=tag.target)
            return [c for c in commits if self.check_commit_by_path(c, path)]
        return None

    def check_commit_by_path(self, commit, path):
        """Return True if *commit*'s tree contains *path* ('a/b/c' or list)."""
        if not commit:
            return False
        # An empty/absent path matches any commit.
        if path is None or len(path) == 0:
            return True
        result = self.get_tree(commit['tree'])

        if not isinstance(path, list):
            path = path.strip().split('/')

        # Walk the tree one path component at a time; the final component may
        # be a blob rather than a subtree.
        for name in path:
            name = name.strip()
            if name in result:
                oid = result[name]
                result = self.get_tree(oid)

                if not result:
                    result = self.get_blob(oid)
        return result is not None

    def get_tree(self, oid, ppath=None):
        """Return one tree level as {name: {'id', 'type', 'ppath'}}, or None."""
        if not self.__repo:
            return None
        try:
            tree = self.__repo.get(oid)
            if tree and tree.type == GIT_OBJ_TREE:
                result = {}
                for entry in tree:
                    item = {
                        'id': str(entry.id)
                    }
                    obj = self.__repo.get(entry.id)
                    if obj.type == GIT_OBJ_BLOB:
                        item['type'] = 'blob'
                    elif obj.type == GIT_OBJ_TREE:
                        item['type'] = 'tree'
                    item['ppath'] = ppath
                    result[entry.name] = item
                return result
        except Exception as e:
            print(e)
        return None

    def get_tree_by_commit(self, commit, path=None):
        """Return the (sub)tree of *commit* after descending *path* (a list)."""
        if not commit:
            return None
        result = self.get_tree(commit['tree'])
        if not path:
            return result

        try:
            for name in path:
                oid = result[name]['id']
                p = result[name]['ppath']
                # Accumulate the parent path so nested entries know where
                # they live.
                p = name if not p else p + '/' + name
                result = self.get_tree(oid, p)
                if not result:
                    break
        except Exception as e:
            print(e)
            result = None
        return result

    def get_current_root(self):
        """Return the root tree of the HEAD commit, or None."""
        tree = self.get_current_commit()
        if tree:
            return self.get_tree(tree['tree'])
        return None

    def get_whole_tree(self, oid):
        ''' Return the recursively-walked tree (via tree_walker) as json. '''
        if not self.__repo:
            return None
        result = tree_walker(self.__repo, oid)
        return result

    def get_blob(self, oid):
        ''' Return a blob as a json-ready dict; binary blobs get content None. '''
        if not self.__repo or not oid:
            return None
        try:
            blob = self.__repo.get(oid)
            if blob and blob.type == GIT_OBJ_BLOB:
                content = blob.is_binary and None or blob.data.decode(
                    'utf8', 'ignore')
                result = {
                    'id': str(blob.id),
                    'content': content,
                    'size': blob.size,
                }
                return result
        except Exception as e:
            print(e)
        return None

    def get_blob_by_commit(self, commit, path=None):
        """Return the blob at *path* (a list of components) in *commit*."""
        try:
            # Descend to the parent tree, then resolve the final component.
            tree = self.get_tree_by_commit(commit, path[:-1])
            oid = tree[path[-1]]['id']
            result = self.get_blob(oid)
            return result
        except Exception as e:
            print(e)
            return None

    def get_tag(self, oid):
        ''' Return an annotated tag as a json-ready dict, or None. '''
        if not self.__repo or not oid:
            return None
        try:
            tag = self.__repo.get(oid)
            if tag and tag.type == GIT_OBJ_TAG:
                result = {
                    'id': str(oid),
                    'name': tag.name,
                    'target': str(tag.target.id),
                    'tagger': tag.tagger,
                    'message': tag.message,
                }
                return result
        except Exception as e:
            print(e)
        return None

    def get_patches(self, a=None, b=None):
        """Return the patches between revisions *a* (default HEAD) and *b*
        (default a^) as a list of dicts with per-hunk detail.

        NOTE(review): uses old pygit2 patch attribute names
        (old_file_path/new_file_path/old_oid/...) -- verify against the
        installed pygit2 version.
        """
        try:
            if not a:
                a = 'HEAD'
            if not b:
                b = a + '^'
            t1 = self.__repo.revparse_single(a)
            t2 = self.__repo.revparse_single(b)
            patches = self.__repo.diff(t1, t2)
            result = []
            for patch in patches:
                p = {
                    'old_file_path': patch.old_file_path,
                    'new_file_path': patch.new_file_path,
                    'old_oid': str(patch.old_oid),
                    'new_oid': str(patch.new_oid),
                    'status': patch.status,
                    'similarity': patch.similarity,
                    'additions': patch.additions,
                    'deletions': patch.deletions,
                    'binary': patch.is_binary,
                    'hunks': [],
                }
                for hunk in patch.hunks:
                    h = {
                        'old_start': hunk.old_start,
                        'old_lines': hunk.old_lines,
                        'new_start': hunk.new_start,
                        'new_lines': hunk.new_lines,
                        'lines': hunk.lines,
                    }
                    p['hunks'].append(h)
                result.append(p)
            return result
        except Exception as e:
            print(e)
        return None
Esempio n. 20
0
# NOTE(review): flat script fragment -- `repo` must be a pygit2 Repository
# defined earlier in the original file.
prev_commit = None

consolidate = True

# Accumulator whose shape depends on the consolidate flag.
# NOTE(review): `matrix` is never populated and `prev_commit` is never
# reassigned in the visible portion, so the diff branch below appears
# unreachable here -- confirm against the full script.
if consolidate is True:
    matrix = {}
else:
    matrix = []

for commit in repo.walk(repo.head.target):
    print(commit.message)

    if prev_commit is not None:
        # get the diff info
        diff = repo.diff(commit, prev_commit)

        # Get the string with changed info and split it
        changes = diff.patch.split('\n')[5:]
        try:
            a = changes[3].split()
            b = changes[4].split()
        except:
            print("last one? and I am too lasy to dump that one so here we go")
            # NOTE(review): the bare `Exception` below is a no-op expression
            # statement, not a raise -- the error is effectively swallowed.
            Exception
        else:
            try:
                person_a = a[0][1:]
                person_b = b[0][1:]
            except:
                print("new person")
Esempio n. 21
0
class GitBlack:
    """Recreates uncommitted (e.g. reformatting) changes as a series of
    commits, each attributed to the original commit(s) that last touched the
    changed lines according to git blame.

    Relies on names defined elsewhere in this module: Repository, Patcher,
    HunkBlamer, IndexEntry, Signature, commit_datetime, index_statuses,
    GitIndexNotEmpty, GIT_DIFF_IGNORE_SUBMODULES and GIT_DELTA_MODIFIED.
    """

    def __init__(self):
        self.repo = Repository(".")
        # filename -> Patcher; populated lazily by get_blamed_deltas().
        self.patchers = {}

    def get_blamed_deltas(self, patch):
        """Blame one diff patch and return its per-delta blame list."""
        filename = patch.delta.old_file.path
        self.patchers[filename] = Patcher(self.repo, filename)
        hb = HunkBlamer(self.repo, patch)
        return hb.blames()

    def group_blame_deltas(self, blames):
        """Bucket each blamed delta under the sorted tuple of commits it was
        blamed on, updating throttled progress output as a side effect."""
        for delta_blame in blames:
            commits = tuple(sorted(delta_blame.commits))
            self.grouped_deltas.setdefault(commits,
                                           []).append(delta_blame.delta)

        self.progress += 1
        now = time.monotonic()
        # Throttle console refreshes to at most ~25 per second.
        if now - self.last_log > 0.04:
            sys.stdout.write("Reading file {}/{} \r".format(
                self.progress, self.total))
            sys.stdout.flush()
            self.last_log = now

    def commit_changes(self):
        """Entry point: blame every modified file's patch in a thread pool,
        group the deltas by originating commits, then create one commit per
        group.

        Raises:
            GitIndexNotEmpty: if anything is already staged in the index.
        """
        start = time.monotonic()
        self.grouped_deltas = {}

        # Refuse to run with a dirty index — we are about to stage files.
        for path, status in self.repo.status().items():
            if status & index_statuses:
                raise GitIndexNotEmpty

        patches = []
        self._file_modes = {}
        diff = self.repo.diff(context_lines=0,
                              flags=GIT_DIFF_IGNORE_SUBMODULES)
        for patch in diff:
            # Only plain modifications are handled (no adds/deletes/renames).
            if patch.delta.status != GIT_DELTA_MODIFIED:
                continue
            self._file_modes[
                patch.delta.old_file.path] = patch.delta.old_file.mode
            patches.append(patch)

        self.progress = 0
        self.last_log = 0
        self.total = len(patches)

        max_workers = 8
        # FIX: run the executor as a context manager so its worker threads
        # are always shut down; the original never called shutdown() and
        # leaked the pool, particularly when a task raised.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            tasks = set()
            for patch in patches:
                tasks.add(executor.submit(self.get_blamed_deltas, patch))
                # Keep the in-flight window small so results are consumed
                # (and memory stays bounded) while blaming continues.
                if len(tasks) > max_workers:
                    done, not_done = wait(tasks, return_when=FIRST_COMPLETED)
                    for task in done:
                        self.group_blame_deltas(task.result())
                    tasks -= set(done)

            # Drain whatever is still pending.
            for task in tasks:
                self.group_blame_deltas(task.result())

        secs = time.monotonic() - start
        sys.stdout.write("Reading file {}/{} ({:.2f} secs).\n".format(
            self.progress, self.total, secs))

        start = time.monotonic()
        self.total = len(self.grouped_deltas)
        self.progress = 0
        self.last_log = 0

        for commits, deltas in self.grouped_deltas.items():
            blobs = self._create_blobs(deltas)
            self._commit(commits, blobs)

        secs = time.monotonic() - start
        print("Making commit {}/{} ({:.2f} secs).".format(
            self.progress, self.total, secs))

    def _create_blobs(self, deltas):
        """Apply each delta via its file's Patcher and write the patched
        contents as blobs; returns {filename: blob_id}."""
        filenames = set()
        for delta in deltas:
            self.patchers[delta.filename].apply(delta)
            filenames.add(delta.filename)

        blobs = {}
        for filename in filenames:
            blob_id = self.repo.create_blob(self.patchers[filename].content())
            blobs[filename] = blob_id

        return blobs

    def _commit(self, original_commits, blobs):
        """Stage the given blobs and create one commit on HEAD.

        The author is taken from the most recent of original_commits; the
        committer comes from the local git config. Progress output is
        throttled the same way as in group_blame_deltas().
        """
        for filename, blob_id in blobs.items():
            file_mode = self._file_modes[filename]
            index_entry = IndexEntry(filename, blob_id, file_mode)
            self.repo.index.add(index_entry)

        commits = [self.repo.get(h) for h in original_commits]

        main_commit = commits[0]
        if len(commits) > 1:
            # most recent commit
            main_commit = sorted(commits, key=commit_datetime)[-1]

        commit_message = main_commit.message
        commit_message += "\n\nautomatic commit by git-black, original commits:\n"
        commit_message += "\n".join(
            ["  {}".format(c) for c in original_commits])

        committer = Signature(
            name=self.repo.config["user.name"],
            email=self.repo.config["user.email"],
        )

        self.repo.index.write()
        tree = self.repo.index.write_tree()
        head = self.repo.head.peel()
        self.repo.create_commit("HEAD", main_commit.author, committer,
                                commit_message, tree, [head.id])
        self.progress += 1
        now = time.monotonic()
        if now - self.last_log > 0.04:
            sys.stdout.write("Making commit {}/{} \r".format(
                self.progress, self.total))
            sys.stdout.flush()
            self.last_log = now
Esempio n. 22
0
class GitMixin(object):
    """Mixin exposing read-only git statistics over an on-disk pygit2
    repository.

    NOTE(review): assumes the consuming class provides ``self.user``,
    ``self.name`` and ``self.git_user`` before ``__init__`` runs — confirm
    against the concrete classes that use this mixin.
    """

    # Matches e.g. "refs/tags/v1.0" and "refs/remotes/origin/master",
    # capturing the kind ("tags" or "remotes") and the remainder of the name.
    tag_or_remote_regex = re.compile('^refs/(tags|remotes)/(.*)')

    def __init__(self):
        where = GitOperations.get_repository_location(self.user, self.name)
        self.ondisk = Repository(where)

    def refresh(self):
        """Fetch all remotes, then point local master at origin/master."""
        creds = GitOperations.get_credentials(self.git_user, self.user)
        for remote in self.ondisk.remotes:
            remote.credentials = creds
            remote.fetch()
        # update current reference
        master_ref = self.ondisk.lookup_reference('refs/heads/master')
        remote_ref = self.ondisk.lookup_reference('refs/remotes/origin/master')
        master_ref.set_target(remote_ref.target)

    def filter_references(self, regex):
        """Return all reference names matching the given compiled regex."""
        return [ref for ref in self.ondisk.listall_references()
                if regex.match(ref)]

    def get_commit_time(self, name):
        """Resolve ``name`` to a Tag or Commit and return its commit time.

        Raises GitException for any other object type.
        """
        ref = self.ondisk.revparse_single(name)
        if isinstance(ref, Tag):
            return ref.get_object().commit_time
        if isinstance(ref, Commit):
            return ref.commit_time
        raise GitException('invalid reference: commit time could not be found.') # pragma: no cover

    def get_latest_refs(self, count=None):
        """Yield (kind, name, commit_time) for every tag/remote ref, newest
        first, limited to ``count`` entries when given."""
        info = self.filter_references(GitMixin.tag_or_remote_regex)
        refs = list(zip(info, map(self.get_commit_time, info)))
        refs.sort(key=itemgetter(1), reverse=True)
        # Split "refs/<kind>/<name>" into its displayable parts.
        def ref_info(info):
            (ref, commit_time) = info
            what, name = GitMixin.tag_or_remote_regex.findall(ref)[0]
            return (what, name, commit_time)
        refs = map(ref_info, refs)
        if not count:
            return refs
        return islice(refs, count)

    def filter_commits(self, flags=0):
        """Walk HEAD's history and keep only commits authored with one of
        the user's registered email addresses (lazy filter object)."""
        all_commits = self.ondisk.walk(self.ondisk.head.target, flags)
        emails = [ue.email for ue in self.user.emails.all()]
        return filter(lambda commit: commit.author.email in emails, all_commits)

    def get_commits(self, count=None):
        """The user's commits in topological order, optionally limited to
        ``count``."""
        all_commits = self.filter_commits(GIT_SORT_TOPOLOGICAL)
        if not count:
            return all_commits
        return islice(all_commits, count)

    def get_commit_count(self):
        """Number of commits authored by the user."""
        return len(list(self.filter_commits()))

    def get_shorthand_of_branch(self, branch):
        """Short name of ``branch``, or '(none)' when it does not exist."""
        commit = self.ondisk.lookup_branch(branch)
        if commit:
            return commit.shorthand
        return '(none)'

    def get_sha1_of_branch(self, branch):
        """First 6 hex characters of the branch tip's id, or '(none)'."""
        commit = self.ondisk.lookup_branch(branch)
        if commit:
            return str(commit.get_object().id)[:6]
        return '(none)'

    def get_numstat(self, commit):
        """Return (files_changed, additions, deletions) for ``commit`` by
        diffing it against its first parent, or against the empty tree for
        the root commit."""
        diff = None
        try:
            previous_commit = self.ondisk.revparse_single(str(commit.id) + '^')
            diff = self.ondisk.diff(previous_commit, commit)
        except KeyError:
            # likely we hit the very first commit.
            diff = commit.tree.diff_to_tree(swap=True)
        additions, deletions = 0, 0
        for patch in diff:
            additions += patch.additions
            deletions += patch.deletions
        return (len(diff), additions, deletions)

    def get_first_updated(self):
        """Commit time of the oldest commit reachable from HEAD."""
        all_commits = self.ondisk.walk(self.ondisk.head.target,
                                       GIT_SORT_TIME | GIT_SORT_REVERSE)
        first_commit = next(all_commits)
        return first_commit.commit_time

    def get_last_updated(self):
        """Commit time of the newest commit reachable from HEAD."""
        all_commits = self.ondisk.walk(self.ondisk.head.target,
                                       GIT_SORT_TIME)
        last_commit = next(all_commits)
        return last_commit.commit_time

    def get_file_count(self):
        """Number of files in HEAD's tree, counted via a diff against the
        empty tree."""
        diff = self.ondisk.head.get_object().tree.diff_to_tree()
        return len([patch.old_file_path for patch in diff])

    def get_line_count(self):
        """Total line count of HEAD's tree (the deletions of a diff from
        HEAD's tree to the empty tree)."""
        diff = self.ondisk.head.get_object().tree.diff_to_tree()
        return sum([patch.deletions for patch in diff])

    def get_author_count(self):
        """Number of distinct author emails among the user's commits."""
        commits = self.filter_commits()
        return len(set([commit.author.email for commit in commits]))

    def commits_between(self, start, end):
        """The user's commits with start <= commit_time <= end, oldest
        first (relies on the time-sorted, reversed walk order)."""
        all_commits = self.filter_commits(GIT_SORT_TIME | GIT_SORT_REVERSE)
        starting = dropwhile(lambda obj: obj.commit_time < start, all_commits)
        return takewhile(lambda obj: obj.commit_time <= end, starting)

    @staticmethod
    def by_day(obj):
        # we want to group our commit times by the day. so convert
        # timestamp -> date -> timestamp
        new_date = date.fromtimestamp(obj.commit_time)
        new_date += timedelta(days=1)
        return timegm(new_date.timetuple())

    @staticmethod
    def group_by(series):
        """Group an iterable of commits (pre-sorted by time) into per-day
        {'date': ..., 'value': count} buckets."""
        result = groupby(series, GitMixin.by_day)
        return [{'date': commit_date,
                 'value': len(list(commits))}
                for commit_date, commits in result]

    def histogram(self, start, end):
        """Per-day commit counts for the user between two timestamps."""
        series = self.commits_between(start, end)
        return GitMixin.group_by(series)
def parse_code_churns(pid, repo_path, branch, start, stop=-1):
    """
    Worker-process entry point: extract code-churn features for a slice of a
    branch's commit history and store them in the shared RES dict under key
    ``pid``.

    Parameters:
        pid: worker index; used as the RES key and as the tqdm position.
        repo_path: path of the git repository to open.
        branch: fully-qualified reference name (e.g. "refs/heads/master").
        start: 1-based index of the first commit to include (<= 0 keeps all).
        stop: exclusive end index of the slice, or -1 for "until the end".
    """
    repo = Repository(repo_path)

    head = repo.references.get(branch)
    # Oldest-first topological history of the branch.
    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    # Convert the 1-based `start` to a 0-based slice index.
    start = start - 1 if (start > 0) else start
    commits = commits[start:stop] if (stop != -1) else commits[start:]

    # One feature row per commit; the first commit in the slice has no
    # predecessor here, so its row (and the trailing one) stays empty.
    code_churns = [[] for _ in range(len(commits))]
    for i, commit in enumerate(tqdm(commits[1:], position=pid)):
        # Diff each commit against its predecessor in the slice.
        diff = repo.diff(commits[i], commit)

        tree = commit.tree
        patches = [p for p in diff]
        stats = diff.stats

        # Count the total lines of code and find the biggest file that have been changed
        total_tloc = 0
        line_of_code_old = 0
        for patch in patches:
            if patch.delta.is_binary:
                continue
            new_file = patch.delta.new_file

            # Total lines of code
            total_tloc += get_file_lines_of_code(repo, tree, new_file)

            old_file = patch.delta.old_file
            # Total lines of code in the old file
            line_of_code_old = max(
                line_of_code_old, get_file_lines_of_code(repo, tree, old_file))

        # Churned lines of code
        cloc = stats.insertions
        # Deleted lines of code
        dloc = stats.deletions

        # Churned files
        files_churned = len(patches)

        # File count
        num_files = count_files(tree, repo)

        # Apply relative code churns
        measure_one = float(cloc) / total_tloc if (
            total_tloc > 0) else float(cloc)
        # BUG FIX: the fallback previously used `cloc`; the deletion measure
        # must fall back to the raw deletion count.
        measure_two = float(dloc) / total_tloc if (
            total_tloc > 0) else float(dloc)
        measure_three = (float(files_churned) / num_files if
                         (num_files > 0) else float(files_churned))

        line_of_code_old = float(line_of_code_old)

        # Churn features (stringified for downstream CSV-style storage).
        code_churns[i].append(str(commit.hex))
        code_churns[i].append(str(measure_one))
        code_churns[i].append(str(measure_two))
        code_churns[i].append(str(measure_three))
        code_churns[i].append(str(line_of_code_old))

    RES[pid] = code_churns