Esempio n. 1
0
def main(repo, out_dir, clobber_output, verbose):
    """Export contributor and citation statistics for a repository as JSON.

    Args:
        repo: Location of the repository. A value starting with ``git@``
            is first handed to ``gitpandas.Repository`` and then replaced
            by that object's ``git_dir``.
        out_dir: Folder that receives the JSON files. ``None`` selects
            ``OUT_SUBFOLDER`` below the current working directory.
        clobber_output: Forwarded to ``make_output_folder`` as
            ``overwrite``.
        verbose: When truthy, the root logger is switched to DEBUG.

    Writes ``contributor_data.json`` and ``citation_data.json`` into
    ``out_dir``; nothing is returned.
    """
    import logging
    from gitpandas import Repository

    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if repo.startswith("git@"):
        logging.info("Cloning repo %s", repo)
        repository = Repository(working_dir=repo)
        repo = repository.git_dir
        logging.info("Repo located at %s", repo)

    if out_dir is None:
        out_dir = os.path.join(os.getcwd(), OUT_SUBFOLDER)

    verify_local_repo_location(repo)
    make_output_folder(out_dir, overwrite=clobber_output)

    contributor_data = author_minded(repo)
    # NOTE(review): the query term is hard-coded here -- confirm that is
    # intended for every repository this is run against.
    citation_data = pmc_data('SPSS')

    contributor_path = os.path.join(out_dir, 'contributor_data.json')
    logging.info("output path: %s", contributor_path)
    contributor_data.to_json(contributor_path, date_format='iso')
    citation_data['citations'].to_json(
        os.path.join(out_dir, 'citation_data.json'))
Esempio n. 2
0
def repository(path):
    """Print a walkthrough of commit and file-change statistics.

    Args:
        path: Passed as the only positional argument to ``Repository``.

    Everything is written to stdout; nothing is returned.
    """
    # build an example repository object and try some things out
    ignore_dirs = [
        'docs/*',
        'tests/*',
        'Data/*',
    ]
    r = Repository(path)

    # is it bare?
    print('\nRepo bare?')
    print(r.is_bare())
    print('\n')

    # get the commit history
    ch = r.commit_history('HEAD', limit=None, include_globs=['*.py'],
                          ignore_globs=ignore_dirs)
    print(ch.head(5))

    # get the list of committers
    print('\nCommiters:')
    print(''.join(str(x) + '\n' for x in set(ch['committer'].values)))
    print('\n')

    # print out everyone's contributions
    # The string alias 'sum' is used instead of np.sum: recent pandas
    # versions warn when a NumPy callable is passed to GroupBy.agg.
    attr = ch.reindex(
        columns=['committer', 'lines', 'insertions', 'deletions'],
    ).groupby(['committer'])
    attr = attr.agg({
        'lines': 'sum',
        'insertions': 'sum',
        'deletions': 'sum',
    })
    print(attr)

    # get the file change history
    fh = r.file_change_history('HEAD', limit=None, ignore_globs=ignore_dirs)
    # text after the last dot; a name without any dot maps to itself
    fh['ext'] = fh['filename'].map(lambda x: x.split('.')[-1])
    print(fh.head(50))

    # print out unique extensions
    print('\nExtensions Found:')
    print(''.join(str(x) + '\n' for x in set(fh['ext'].values)))
    print('\n')

    # agg by extension
    etns = fh.reindex(
        columns=['ext', 'insertions', 'deletions'],
    ).groupby(['ext'])
    etns = etns.agg({
        'insertions': 'sum',
        'deletions': 'sum',
    })
    print(etns)
Esempio n. 3
0
def author_minded(working_dir, frequency=None):
    """Summarise the commit behaviour of every author in a repository.

    Args:
        working_dir: Passed to ``gitpandas.Repository`` as ``working_dir``.
        frequency: ``numpy.timedelta64`` giving the largest whole-day gap
            between consecutive commits that still counts towards a run.
            Defaults to zero days.

    Returns:
        pandas.DataFrame indexed by author with one column per statistic:
        ``first`` / ``last`` (smallest / largest index value of the
        author's commits), ``line_changes`` and ``commits`` (the author's
        share of the repository totals), ``median_commit_frequency`` and
        ``max_dry_stretch`` (median / largest gap between consecutive
        commits, in whole days) and ``max_dayly_commit_run`` (length of
        the longest streak of equal day-gaps that do not exceed
        ``frequency``; 0 when there is no such gap).  The three gap
        statistics are NaN for authors with a single commit.
    """
    from itertools import groupby

    # Imported as a module so numpy's min/max do not shadow the builtins.
    import numpy as np
    from pandas import DataFrame
    from gitpandas import Repository

    if frequency is None:
        frequency = np.timedelta64(0, 'D')

    repo = Repository(working_dir=working_dir)
    commits = repo.commit_history()
    # Materialised once so the loop order and the index labels passed to
    # DataFrame below are guaranteed to line up.
    authors = list(set(commits.author))

    tot_lines = float(commits.lines.sum())
    tot_commits = float(len(commits))
    result = {
        'first': [],
        'last': [],
        'line_changes': [],
        'commits': [],
        'median_commit_frequency': [],
        'max_dry_stretch': [],
        'max_dayly_commit_run': [],
    }
    for author in authors:
        specific = commits[commits.author == author]
        result['first'].append(specific.index.min())
        result['last'].append(specific.index.max())
        result['line_changes'].append(specific.lines.sum() / tot_lines)
        result['commits'].append(len(specific) / tot_commits)

        # gaps between consecutive commits, taken over the reversed index
        deriv = np.diff(specific.index[::-1])
        if len(deriv) == 0:
            result['median_commit_frequency'].append(np.nan)
            result['max_dry_stretch'].append(np.nan)
            result['max_dayly_commit_run'].append(np.nan)
            continue

        result['median_commit_frequency'].append(
            np.median(deriv).astype('timedelta64[D]'))
        result['max_dry_stretch'].append(
            np.max(deriv).astype('timedelta64[D]'))

        runs = [
            len(list(run))
            for gap, run in groupby(deriv.astype('timedelta64[D]'))
            if gap <= frequency
        ]
        # No gap is within `frequency` when every commit is further apart;
        # taking max() of the empty list would raise ValueError.
        result['max_dayly_commit_run'].append(max(runs) if runs else 0)

    return DataFrame(result, index=authors)
Esempio n. 4
0
def main():
    """Run every fetch step against the ``../nixpkgs`` repository.

    A log line is emitted before each step. The commit history feeds the
    hours estimate and the file change history feeds the file change
    rate; the other steps only receive the repository.
    """
    repo = Repository(working_dir=os.path.abspath('../nixpkgs'))

    logger.info('fetching commit history')
    commit_history = fetch_commit_history(repo)

    logger.info('fetching hours estimate')
    hours_estimate = fetch_hours_estimate(repo, commit_history)

    logger.info('fetching file change history')
    file_change_history = fetch_file_change_history(repo)

    logger.info('fetching file change rate')
    file_change_rate = fetch_file_change_rate(repo, file_change_history)

    logger.info('fetching cumulative blame')
    cumulative_blame = fetch_cumulative_blame(repo)

    logger.info('fetching bus factor')
    bus_factor = fetch_bus_factor(repo)

    logger.info('fetching file owner')
    file_owner = fetch_file_owner(repo)

    logger.info('fetching punch card')
    punch_card = fetch_punch_card(repo)
Esempio n. 5
0
    def setUp(self):
        """Build a throw-away git repository for the tests.

        Recreates the ``repos`` folder next to this file, initialises
        ``repos/repository1`` as a non-bare repository, commits a README
        followed by ``file_0.py`` .. ``file_4.py`` (one commit each) and
        stores a ``Repository`` for it on ``self.repo``.

        :return:
        """
        project_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'repos')
        repo_dir = os.path.join(project_dir, 'repository1')

        # start from a clean slate on every run
        if os.path.exists(project_dir):
            shutil.rmtree(project_dir)

        # creates project_dir as well
        os.makedirs(repo_dir)

        # create an empty repo (but not bare)
        grepo = git.Repo.init(repo_dir)

        # add a file
        with open(os.path.join(repo_dir, 'README.md'), 'w') as f:
            f.write('Sample README for a sample project\n')

        # commit it
        grepo.git.add('README.md')
        grepo.git.commit(m='first commit')

        # now add some other files:
        for idx in range(5):
            file_name = 'file_%d.py' % idx
            with open(os.path.join(repo_dir, file_name), 'w') as f:
                f.write('import sys\nimport os\n')

            # NOTE(review): presumably spaces the commits out so each one
            # gets a distinct timestamp -- confirm before shortening.
            time.sleep(2.0)
            grepo.git.add(all=True)
            grepo.git.commit(m='adding %s' % file_name)

        self.repo = Repository(working_dir=repo_dir, verbose=True)
Esempio n. 6
0
"""
Assumes that GitPython and pandas are in the same directory as this repo, and nothing else is in that directory.
"""

from gitpandas import Repository

__author__ = 'willmcginnis'

if __name__ == '__main__':
    # Every query below is restricted to Python sources; a fresh list is
    # built for each call.
    python_only = ('*.py',)
    flask_repo = Repository(working_dir='git://github.com/mitsuhiko/flask.git')

    # do some blaming (the result is not used further in this script)
    flask_blame = flask_repo.blame(include_globs=list(python_only))

    # commit history of master, used for the committer count
    flask_ch = flask_repo.commit_history(
        'master',
        limit=None,
        include_globs=list(python_only),
    )
    committers = set(flask_ch['committer'].values)

    print('\tflask committers: %d' % (len(committers)))
    print('\tflask bus count:')
    print(flask_repo.bus_factor(include_globs=list(python_only)))
Esempio n. 7
0
from gitpandas import Repository
import time

from definitions import GIT_PANDAS_DIR

__author__ = 'willmcginnis'


if __name__ == '__main__':
    # Time the serial and the parallel cumulative blame on the same inputs
    # and print the head of each result followed by the elapsed seconds.
    repo = Repository(working_dir=GIT_PANDAS_DIR)
    globs = ('*.py', '*.html', '*.sql', '*.md')

    # serial run
    started = time.time()
    serial_blame = repo.cumulative_blame(
        branch='master',
        include_globs=list(globs),
        limit=None,
        skip=None,
    )
    print(serial_blame.head())
    print(time.time() - started)

    # parallel run with four workers
    started = time.time()
    parallel_blame = repo.parallel_cumulative_blame(
        branch='master',
        include_globs=list(globs),
        limit=None,
        skip=None,
        workers=4,
    )
    print(parallel_blame.head())
    print(time.time() - started)
Esempio n. 8
0
import os
from gitpandas import Repository

__author__ = 'willmcginnis'


if __name__ == '__main__':
    # Print the change rates (with coverage) of the Python files in the
    # checkout at ../../git-pandas, resolved against the working directory.
    checkout = os.path.abspath('../../git-pandas')
    repo = Repository(working_dir=checkout)
    print(repo.file_change_rates(include_globs=['*.py'], coverage=True))
Esempio n. 9
0
 def setUp(self):
     """Store a verbose ``Repository`` for the git-pandas URL on ``self.repo``."""
     remote = 'git://github.com/wdm0006/git-pandas.git'
     self.repo = Repository(working_dir=remote, verbose=True)
Esempio n. 10
0
from gitpandas import Repository

__author__ = 'willmcginnis'

if __name__ == '__main__':
    # Print the per-file blame of the lifelines repository, limited to the
    # 'py' extension and with committer=False.
    remote = 'git://github.com/CamDavidsonPilon/lifelines.git'
    repo = Repository(working_dir=remote, verbose=True)

    shared_blame = repo.blame(extensions=['py'], committer=False, by='file')
    print(shared_blame)
Esempio n. 11
0
from gitpandas import Repository
import numpy as np
import lifelines
import matplotlib.pyplot as plt
plt.style.use('ggplot')

__author__ = 'willmcginnis'

if __name__ == '__main__':
    # A change whose insertions and deletions differ by more than this
    # many lines is flagged as a refactor.
    threshold = 100
    repo = Repository(
        working_dir='git://github.com/scikit-learn/scikit-learn.git',
        verbose=True)
    fch = repo.file_change_history(limit=None, include_globs=['*.py'])

    fch['file_owner'] = ''
    fch['refactor'] = 0
    # Integer-divides the index by the number of nanoseconds in a day
    # (assumes a nanosecond-resolution datetime index -- TODO confirm).
    fch['timestamp'] = fch.index.astype(np.int64) // (24 * 3600 * 10**9)
    fch['observed'] = False
    fch = fch.reset_index()

    # add in the file owner and whether or not each item is a refactor
    # DataFrame.set_value was removed in pandas 1.0; .at is the supported
    # label-based scalar setter.
    for idx, row in fch.iterrows():
        fch.at[idx, 'file_owner'] = repo.file_owner(row.rev, row.filename)
        is_refactor = abs(row.insertions - row.deletions) > threshold
        fch.at[idx, 'refactor'] = 1 if is_refactor else 0
import matplotlib.pyplot as plt
import os
import json
from gitpandas import Repository, ProjectDirectory
import matplotlib
matplotlib.style.use('ggplot')

__author__ = 'willmcginnis'

if __name__ == '__main__':
    # Plot the cumulative blame of the checkout at ../../git-pandas as a
    # stacked area chart.
    checkout = os.path.abspath('../../git-pandas')
    repo = Repository(working_dir=checkout, verbose=True)

    blame = repo.cumulative_blame(
        branch='master',
        extensions=['py'],
        ignore_dir=['docs'],
        limit=None,
        skip=None,
    )

    axes = blame.plot(kind='area', stacked=True)
    plt.title('Cumulative Blame')
    plt.xlabel('date')
    plt.ylabel('LOC')

    # Narrow the axes to 80% of their width and anchor the legend at the
    # right-hand edge of the axes.
    bounds = axes.get_position()
    axes.set_position(
        [bounds.x0, bounds.y0, bounds.width * 0.8, bounds.height])
    axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()