def main(repo, out_dir, clobber_output, verbose):
    """
    Collect contributor and citation data for a repository and dump both as JSON.

    :param repo: path of a local repository, or an address starting with
        ``git@``; in the latter case it is handed to ``gitpandas.Repository``
        and the resulting ``git_dir`` is used as the local path
    :param out_dir: directory the JSON files are written to; when ``None`` it
        defaults to ``OUT_SUBFOLDER`` under the current working directory
    :param clobber_output: forwarded to ``make_output_folder`` as ``overwrite``
    :param verbose: when truthy, the root logger is switched to DEBUG level
    :return: None; writes ``contributor_data.json`` and ``citation_data.json``
        into ``out_dir``
    """
    import logging

    from gitpandas import Repository

    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if repo.startswith("git@"):
        logging.info("Cloning repo %s", repo)
        repository = Repository(working_dir=repo)
        repo = repository.git_dir
        logging.info("Repo located at %s", repo)

    if out_dir is None:
        out_dir = os.path.join(os.getcwd(), OUT_SUBFOLDER)

    verify_local_repo_location(repo)
    make_output_folder(out_dir, overwrite=clobber_output)

    contributor_data = author_minded(repo)
    # NOTE(review): the citation query is hard-coded to 'SPSS' -- confirm
    # that this is intended rather than derived from the repository.
    citation_data = pmc_data('SPSS')

    contributor_path = os.path.join(out_dir, 'contributor_data.json')
    logging.info("output path: %s", contributor_path)
    contributor_data.to_json(contributor_path, date_format='iso')
    citation_data['citations'].to_json(
        os.path.join(out_dir, 'citation_data.json'))
def repository(path):
    """
    Run a handful of Repository queries against the repo at ``path`` and
    print each result.

    Prints, in order: whether the repo is bare, the first rows of the commit
    history for ``*.py`` files, the distinct committers, per-committer sums of
    lines/insertions/deletions, the first rows of the file change history,
    the distinct file extensions, and per-extension sums of
    insertions/deletions.

    :param path: passed straight to ``Repository``
    :return: None
    """
    # glob patterns left out of both history queries below
    excluded = ['docs/*', 'tests/*', 'Data/*']

    repo = Repository(path)

    print('\nRepo bare?')
    print(repo.is_bare())
    print('\n')

    # commit history, restricted to python files
    history = repo.commit_history(
        'HEAD',
        limit=None,
        include_globs=['*.py'],
        ignore_globs=excluded,
    )
    print(history.head(5))

    print('\nCommiters:')
    committers = set(history['committer'].values)
    print(''.join([str(name) + '\n' for name in committers]))
    print('\n')

    # total contribution of each committer
    contribution_cols = ['lines', 'insertions', 'deletions']
    by_committer = history.reindex(columns=['committer'] + contribution_cols)
    by_committer = by_committer.groupby(['committer'])
    print(by_committer.agg({col: np.sum for col in contribution_cols}))

    # per-file change history, tagged with the text after the last dot
    changes = repo.file_change_history('HEAD', limit=None, ignore_globs=excluded)
    changes['ext'] = changes['filename'].map(lambda name: name.rsplit('.', 1)[-1])
    print(changes.head(50))

    print('\nExtensions Found:')
    extensions = set(changes['ext'].values)
    print(''.join([str(ext) + '\n' for ext in extensions]))
    print('\n')

    # totals per extension
    change_cols = ['insertions', 'deletions']
    by_extension = changes.reindex(columns=['ext'] + change_cols).groupby(['ext'])
    print(by_extension.agg({col: np.sum for col in change_cols}))
def author_minded(working_dir, frequency=None):
    """
    Summarise the commit history of a repository per author.

    The returned frame has one row per distinct author and these columns:

    * ``first`` / ``last`` -- smallest / largest commit-history index value
      among the author's commits (presumably commit dates -- confirm against
      ``Repository.commit_history``)
    * ``line_changes`` -- the author's share of the summed ``lines`` column
    * ``commits`` -- the author's share of the number of commits
    * ``median_commit_frequency`` -- median gap between the author's
      consecutive commits, in whole days
    * ``max_dry_stretch`` -- largest such gap, in whole days
    * ``max_dayly_commit_run`` -- length of the longest run of consecutive
      equal whole-day gaps that do not exceed ``frequency``; ``0`` when no
      gap qualifies

    The three gap columns are ``nan`` for authors with a single commit.

    :param working_dir: passed to ``gitpandas.Repository`` as ``working_dir``
    :param frequency: largest gap that still counts towards a commit run;
        defaults to ``timedelta64(0, 'D')``
    :return: ``pandas.DataFrame`` indexed by author
    """
    from itertools import groupby

    # numpy's max is imported under an alias so the builtin ``max`` (needed
    # for its ``default=`` argument below) is not shadowed.
    from numpy import diff, median, nan, timedelta64
    from numpy import max as np_max
    from pandas import DataFrame
    from gitpandas import Repository

    if frequency is None:
        frequency = timedelta64(0, 'D')

    repo = Repository(working_dir=working_dir)
    commits = repo.commit_history()

    # Materialise the distinct authors once, as a list, so the row order
    # built in the loop and the index handed to DataFrame cannot diverge.
    authors = list(set(commits.author))
    tot_lines = float(commits.lines.sum())
    tot_commits = float(len(commits))

    result = {
        'first': [],
        'last': [],
        'line_changes': [],
        'commits': [],
        'median_commit_frequency': [],
        'max_dry_stretch': [],
        'max_dayly_commit_run': []
    }

    for author in authors:
        specific = commits[commits.author == author]
        result['first'].append(specific.index.min())
        result['last'].append(specific.index.max())
        result['line_changes'].append(specific.lines.sum() / tot_lines)
        result['commits'].append(len(specific) / tot_commits)

        # gaps between consecutive commits, taken over the reversed index
        deriv = diff(specific.index[::-1])
        if len(deriv) == 0:
            # a single commit has no gaps to measure
            result['median_commit_frequency'].append(nan)
            result['max_dry_stretch'].append(nan)
            result['max_dayly_commit_run'].append(nan)
            continue

        result['median_commit_frequency'].append(
            median(deriv).astype('timedelta64[D]'))
        result['max_dry_stretch'].append(
            np_max(deriv).astype('timedelta64[D]'))

        day_gaps = deriv.astype('timedelta64[D]')
        run_lengths = (
            len(list(run))
            for gap, run in groupby(day_gaps)
            if gap <= frequency
        )
        # ``default=0``: an author whose gaps all exceed ``frequency`` has no
        # qualifying run; taking the max of an empty list used to raise.
        result['max_dayly_commit_run'].append(max(run_lengths, default=0))

    return DataFrame(result, index=authors)
def main():
    """
    Run every fetch step against the ``../nixpkgs`` checkout, logging a
    message before each one.

    The commit history feeds the hours estimate and the file change history
    feeds the file change rate; the results of the remaining steps are not
    used inside this function.
    """
    nixpkgs = Repository(working_dir=os.path.abspath('../nixpkgs'))

    logger.info('fetching commit history')
    commit_history = fetch_commit_history(nixpkgs)

    logger.info('fetching hours estimate')
    fetch_hours_estimate(nixpkgs, commit_history)

    logger.info('fetching file change history')
    file_changes = fetch_file_change_history(nixpkgs)

    logger.info('fetching file change rate')
    fetch_file_change_rate(nixpkgs, file_changes)

    logger.info('fetching cumulative blame')
    fetch_cumulative_blame(nixpkgs)

    logger.info('fetching bus factor')
    fetch_bus_factor(nixpkgs)

    logger.info('fetching file owner')
    fetch_file_owner(nixpkgs)

    logger.info('fetching punch card')
    fetch_punch_card(nixpkgs)
def setUp(self):
    """
    Build a fresh fixture repository on disk and wrap it in a Repository.

    Any existing ``repos`` directory next to this file is removed first.
    ``repos/repository1`` is then initialised as a non-bare git repository
    holding a README commit followed by five commits that each add one
    ``file_<n>.py``. The result is stored on ``self.repo``.

    :return:
    """
    project_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'repos')
    repo_dir = os.path.join(project_dir, 'repository1')

    # start from a clean slate; makedirs creates project_dir on the way
    if os.path.exists(project_dir):
        shutil.rmtree(project_dir)
    os.makedirs(repo_dir)

    # create an empty repo (but not bare)
    grepo = git.Repo.init(repo_dir)

    # add a file
    with open(os.path.join(repo_dir, 'README.md'), 'w') as f:
        f.write('Sample README for a sample project\n')

    # commit it
    grepo.git.add('README.md')
    grepo.git.commit(m='first commit')

    # now add some other files, one commit each
    for idx in range(5):
        filename = 'file_%d.py' % idx
        with open(os.path.join(repo_dir, filename), 'w') as f:
            f.write('import sys\nimport os\n')

        # presumably spaces the commits out in time -- confirm before
        # shortening, tests may rely on distinct commit timestamps
        time.sleep(2.0)
        grepo.git.add(all=True)
        grepo.git.commit(m='adding %s' % filename)

    self.repo = Repository(working_dir=repo_dir, verbose=True)
""" Assumes that GitPython and pandas are in the same directory as this repo, and nothing else is in that directory. """ from gitpandas import Repository __author__ = 'willmcginnis' if __name__ == '__main__': flask_repo = Repository(working_dir='git://github.com/mitsuhiko/flask.git') # do some blaming flask_blame = flask_repo.blame(include_globs=['*.py']) # figure out committer count from each flask_ch = flask_repo.commit_history('master', limit=None, include_globs=['*.py']) print('\tflask committers: %d' % (len(set(flask_ch['committer'].values)))) print('\tflask bus count:') print(flask_repo.bus_factor(include_globs=['*.py']))
"""
Time ``cumulative_blame`` against ``parallel_cumulative_blame`` (4 workers) on
the repository at ``GIT_PANDAS_DIR``, printing the head of each result
followed by the elapsed seconds.
"""

from gitpandas import Repository
import time
from definitions import GIT_PANDAS_DIR

__author__ = 'willmcginnis'


if __name__ == '__main__':
    repo = Repository(working_dir=GIT_PANDAS_DIR)

    # serial run
    started = time.time()
    serial_blame = repo.cumulative_blame(
        branch='master',
        include_globs=['*.py', '*.html', '*.sql', '*.md'],
        limit=None,
        skip=None,
    )
    print(serial_blame.head())
    print(time.time() - started)

    # parallel run over the same branch and globs
    started = time.time()
    parallel_blame = repo.parallel_cumulative_blame(
        branch='master',
        include_globs=['*.py', '*.html', '*.sql', '*.md'],
        limit=None,
        skip=None,
        workers=4,
    )
    print(parallel_blame.head())
    print(time.time() - started)
"""
Print the file change rates (with coverage) of the python files in the
``../../git-pandas`` checkout.
"""

import os
from gitpandas import Repository

__author__ = 'willmcginnis'


if __name__ == '__main__':
    checkout = os.path.abspath('../../git-pandas')
    git_pandas = Repository(working_dir=checkout)
    change_rates = git_pandas.file_change_rates(
        include_globs=['*.py'],
        coverage=True,
    )
    print(change_rates)
def setUp(self):
    """Store a verbose Repository for the git-pandas GitHub remote on ``self.repo``."""
    remote = 'git://github.com/wdm0006/git-pandas.git'
    self.repo = Repository(working_dir=remote, verbose=True)
"""
Print the per-file blame (``committer=False``) of the python files in the
lifelines GitHub repository.
"""

from gitpandas import Repository

__author__ = 'willmcginnis'


if __name__ == '__main__':
    lifelines_repo = Repository(
        working_dir='git://github.com/CamDavidsonPilon/lifelines.git',
        verbose=True,
    )
    # NOTE(review): other examples pass include_globs=['*.py'] instead of
    # extensions=['py'] -- confirm which keyword the installed gitpandas accepts.
    blame_by_file = lifelines_repo.blame(
        extensions=['py'],
        committer=False,
        by='file',
    )
    print(blame_by_file)
"""
Build a per-change table for the python files of scikit-learn: every row of
the file change history is annotated with the file owner at that revision,
a refactor flag, a day-resolution timestamp and an ``observed`` flag.
"""

from gitpandas import Repository
import numpy as np
import lifelines
import matplotlib.pyplot as plt
plt.style.use('ggplot')

__author__ = 'willmcginnis'


if __name__ == '__main__':
    # a change counts as a refactor when |insertions - deletions| exceeds this
    threshold = 100

    repo = Repository(
        working_dir='git://github.com/scikit-learn/scikit-learn.git',
        verbose=True)
    fch = repo.file_change_history(limit=None, include_globs=['*.py'])

    # placeholder columns, created up front to fix the column order
    fch['file_owner'] = ''
    fch['refactor'] = 0
    # index as int64 floor-divided by the nanoseconds in a day
    # (assumes a nanosecond-resolution datetime index -- TODO confirm)
    fch['timestamp'] = fch.index.astype(np.int64) // (24 * 3600 * 10**9)
    fch['observed'] = False
    fch = fch.reset_index()

    # add in the file owner and whether or not each item is a refactor.
    # DataFrame.set_value no longer exists in current pandas, so both columns
    # are filled by whole-column assignment instead of cell-by-cell writes.
    fch['file_owner'] = [
        repo.file_owner(rev, filename)
        for rev, filename in zip(fch['rev'], fch['filename'])
    ]
    net_change = (fch['insertions'] - fch['deletions']).abs()
    fch['refactor'] = (net_change > threshold).astype(int)
"""
Plot the cumulative blame of the python files in the ``../../git-pandas``
checkout as a stacked area chart, with the legend placed to the right of the
axes.
"""

import matplotlib.pyplot as plt
import os
import json
from gitpandas import Repository, ProjectDirectory
import matplotlib
matplotlib.style.use('ggplot')

__author__ = 'willmcginnis'


if __name__ == '__main__':
    checkout = os.path.abspath('../../git-pandas')
    repo = Repository(working_dir=checkout, verbose=True)

    # NOTE(review): other examples call cumulative_blame with include_globs
    # rather than extensions / ignore_dir -- confirm against the installed
    # gitpandas version.
    blame = repo.cumulative_blame(
        branch='master',
        extensions=['py'],
        ignore_dir=['docs'],
        limit=None,
        skip=None,
    )

    axes = blame.plot(kind='area', stacked=True)
    plt.title('Cumulative Blame')
    plt.xlabel('date')
    plt.ylabel('LOC')

    # narrow the axes to 80% of their width to make room for the legend
    frame = axes.get_position()
    axes.set_position([frame.x0, frame.y0, frame.width * 0.8, frame.height])
    axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.show()