def bsuite_bar_plot(df_in: pd.DataFrame,
                    sweep_vars: Sequence[str] = None) -> gg.ggplot:
    """Build the bsuite summary bar chart.

    The input is first passed through `_clean_bar_plot_data`; the resulting
    frame must expose the `env`, `score`, `type` and `finished` columns that
    are read below.

    Args:
      df_in: bsuite results to plot.
      sweep_vars: optional column names. When given, the chart is facetted on
        them in a single column and its height grows with the number of
        distinct value combinations.

    Returns:
      The assembled plot, with its figure size already set.
    """
    plot_df = _clean_bar_plot_data(df_in, sweep_vars)

    # Components are added to the plot strictly in this order.
    components = [
        gg.aes(x='env', y='score', colour='type', fill='type'),
        gg.geom_bar(position='dodge', stat='identity'),
        # Dashed horizontal reference line at score == 1.
        gg.geom_hline(yintercept=1., linetype='dashed', alpha=0.5),
        gg.scale_colour_manual(plotting.CATEGORICAL_COLOURS),
        gg.scale_fill_manual(plotting.CATEGORICAL_COLOURS),
        gg.xlab('experiment'),
        gg.theme(axis_text_x=gg.element_text(angle=25, hjust=1)),
    ]
    plot = gg.ggplot(plot_df)
    for component in components:
        plot = plot + component

    # Unfinished jobs are drawn with a lighter alpha; the extra aesthetic is
    # only mapped when at least one job is unfinished.
    has_unfinished = not all(plot_df.finished)
    if has_unfinished:
        plot += gg.aes(alpha='finished')
        plot += gg.scale_alpha_discrete(range=[0.3, 1.0])

    # One facet row per distinct sweep setting; a single row otherwise.
    n_hypers = 1
    if sweep_vars:
        plot += gg.facet_wrap(sweep_vars, labeller='label_both', ncol=1)
        n_hypers = len(plot_df[sweep_vars].drop_duplicates())

    figure_height = 3 * n_hypers + 1
    return plot + gg.theme(figure_size=(14, figure_height))
def _bar_plot_compare(df: pd.DataFrame) -> gg.ggplot:
    """Bar plot of score data with one bar per agent.

    Args:
      df: frame exposing the `agent`, `score` and `finished` columns read
        below.

    Returns:
      The plot; bars of unfinished jobs are drawn with a lighter alpha when
      at least one row is not finished.
    """
    per_agent = gg.aes(x='agent', y='score', colour='agent', fill='agent')
    bars = gg.geom_bar(position='dodge', stat='identity')
    # Dashed horizontal reference line at score == 1.
    reference = gg.geom_hline(yintercept=1., linetype='dashed', alpha=0.5)
    tilted_labels = gg.theme(axis_text_x=gg.element_text(angle=25, hjust=1))
    colour_scale = gg.scale_colour_manual(plotting.CATEGORICAL_COLOURS)
    fill_scale = gg.scale_fill_manual(plotting.CATEGORICAL_COLOURS)

    plot = (gg.ggplot(df) + per_agent + bars + reference + tilted_labels
            + colour_scale + fill_scale)

    if all(df.finished):
        return plot

    # Add a layer of alpha for unfinished jobs.
    plot += gg.aes(alpha='finished')
    plot += gg.scale_alpha_discrete(range=[0.3, 1.0])
    return plot
def _cli_build_parser():
    """Create the argument parser describing the GAP command line."""
    parser = argparse.ArgumentParser(
        description='GAP - Git Activity Predictor')
    parser.add_argument(
        'paths', metavar='PATH', type=str, nargs='*', default=['.'],
        help='Paths to one or more git repositories')
    parser.add_argument(
        '--date', type=lambda d: dateutil.parser.parse(d).date(),
        required=False, default=datetime.date.today(),
        help='Date used for predictions (default to current date)')
    parser.add_argument(
        '--obs', type=int, required=False, default=20,
        help='Number of observations to consider')
    parser.add_argument(
        '--probs', metavar='PROB', type=float, nargs='*', required=False,
        default=[0.5, 0.6, 0.7, 0.8, 0.9],
        help='Probabilities to output, strictly in [0,1].')
    parser.add_argument(
        '--limit', type=int, required=False, default=30,
        help='Limit contributors to the one that were active at least once '
             'during the last x days (default 30)')
    parser.add_argument(
        '--mapping', type=str, nargs='?',
        help='Mapping file to merge identities. This file must be a csv file '
             'where each line contains two values: the name to be merged, and '
             'the corresponding identity. Use "IGNORE" as identity to ignore '
             'specific names.')
    parser.add_argument(
        '--branches', metavar='BRANCH', type=str, nargs='*', default=list(),
        help='Git branches to analyse (default to all).')
    parser.add_argument(
        '--as-dates', dest='as_dates', action='store_true',
        help='Express predictions using dates instead of time differences '
             'in days')

    # At most one output format may be requested explicitly.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--text', action='store_true',
                       help='Print results as text.')
    group.add_argument('--csv', action='store_true',
                       help='Print results as csv.')
    group.add_argument('--json', action='store_true',
                       help='Print results as json.')
    group.add_argument(
        '--plot', nargs='?', const=True,
        help='Export results to a plot. Filepath can be optionaly specified.')
    return parser


def _cli_load_mapping(path):
    """Read the identity mapping csv at `path` into a {source: target} dict.

    Returns an empty dict when no mapping file was given.
    """
    if not path:
        return {}
    table = pandas.read_csv(path, names=['source', 'target'])
    return dict(zip(table['source'], table['target']))


def _cli_collect_activity(paths, branches, mapping):
    """Gather the authoring dates of every commit, grouped by identity.

    Exits the process with status 1 as soon as one repository cannot be
    opened. Commits that cannot be read are reported on stdout and skipped.

    Returns:
        dict mapping an identity to the (unsorted) list of its commit dates.
    """
    activity = {}  # identity -> dates of activity

    for path in paths:
        try:
            repo = git.Repo(path)
        except Exception as e:  # Must be refined
            print('Unable to access repository {} ({}:{})'.format(
                path, e.__class__.__name__, e))
            # A repository that cannot be read is a failure: report it through
            # the exit status instead of exiting "successfully".
            sys.exit(1)

        if branches:
            # NOTE(review): the branch names are joined into ONE revision
            # string; confirm that GitPython/git accept this when several
            # branches are given.
            commits = repo.iter_commits(' '.join(branches))
        else:
            commits = repo.iter_commits('--all')

        for commit in commits:
            try:
                author = commit.author.name
                identity = mapping.get(author, author)
                # Names mapped to "ignore" are dropped, but an author who is
                # literally called "ignore" is kept.
                if author.lower() != 'ignore' and identity.lower() == 'ignore':
                    continue

                date = datetime.date.fromtimestamp(commit.authored_date)
                activity.setdefault(identity, []).append(date)
            except Exception as e:
                # Best effort: one unreadable commit must not abort the run.
                print('Unable to read commit ({}: {}): {}'.format(
                    e.__class__.__name__, e, commit))

    return activity


def _cli_predict(activity, date, obs, probs, limit):
    """Apply the survival model to every sufficiently active author.

    An author is kept when `dates_to_duration` yields at least `obs`
    durations and the last activity is at most `limit` days before `date`.

    Returns:
        list of (author, sorted past activity dates, predicted durations),
        with one predicted duration per entry of `probs`.
    """
    oldest_allowed = date - datetime.timedelta(limit)
    data = []

    for author, dates in activity.items():
        # Activity after the reference date is not considered.
        dates = sorted(d for d in dates if d <= date)
        durations = dates_to_duration(dates, window_size=obs)
        if len(durations) < obs:
            continue

        # Currently implemented with no censor
        surv = SurvfuncRight(durations, [1] * len(durations))
        predictions = [surv.quantile(p) for p in probs]

        if dates[-1] >= oldest_allowed:
            data.append((author, dates, predictions))

    return data


def _cli_print_table(data, args):
    """Print the predictions as text, csv or json, as selected in `args`."""
    # Authors are unique (they come from dict keys); a list keeps their order
    # deterministic, which a set would not.
    df = pandas.DataFrame(index=[author for author, _, _ in data],
                          columns=['last'] + args.probs)
    df.index.name = 'author'

    for author, dates, predictions in data:
        last = dates[-1]
        df.at[author, 'last'] = (
            last if args.as_dates else (last - args.date).days)

        for prob, duration in zip(args.probs, predictions):
            expected = last + datetime.timedelta(days=int(duration))
            df.at[author, prob] = (
                expected if args.as_dates else (expected - args.date).days)

    # Most recently active authors first, then by increasing predictions.
    df = df.sort_values(['last'] + args.probs,
                        ascending=[False] + [True] * len(args.probs))
    df = df.astype(str)

    if args.text:
        pandas.set_option('expand_frame_repr', False)
        pandas.set_option('display.max_columns', 999)
        print(df)
    elif args.csv:
        print(df.to_csv())
    elif args.json:
        print(df.to_json(orient='index'))


def _cli_export_plot(data, date, probs, filepath):
    """Draw past activity and forecasts per author and save it to `filepath`."""
    import warnings

    VIEW_LIMIT = 28

    activities = []  # (author, day), day being a delta w.r.t. `date`
    forecasts = []  # (author, from_day, to_day, p): probability p applies
    # between from_day and to_day (deltas w.r.t. `date`)

    for author, dates, predictions in data:
        last = (dates[-1] - date).days
        for e in dates:
            activities.append((author, (e - date).days))

        previous = previous_previous = 0
        for d, p in zip(predictions, probs):
            if d > previous:
                forecasts.append((author, last + previous, last + d, p))
                previous_previous = previous
                previous = d
            else:
                # Same horizon as before: emit the same segment again with
                # this probability. Duplicated segments are reduced to the
                # entry with the highest p before plotting.
                forecasts.append(
                    (author, last + previous_previous, last + d, p))

    activities = pandas.DataFrame(columns=['author', 'day'], data=activities)
    forecasts = pandas.DataFrame(columns=['author', 'fromd', 'tod', 'p'],
                                 data=forecasts)

    # Because of plotnine's way of initializing matplotlib. The suppression is
    # scoped so that the process-wide warning filters are left untouched.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        chart = (
            p9.ggplot(p9.aes(y='author'))
            + p9.geom_segment(
                p9.aes('day - 0.5', 'author',
                       xend='day + 0.5', yend='author'),
                data=activities,
                size=4,
                color='orange',
            )
            + p9.geom_segment(
                p9.aes('fromd + 0.5', 'author',
                       xend='tod + 0.5', yend='author', alpha='factor(p)'),
                data=forecasts.sort_values('p').drop_duplicates(
                    ['author', 'fromd', 'tod'], keep='last'),
                size=4,
                color='steelblue',
            )
            + p9.geom_vline(
                xintercept=0, color='r', alpha=0.5, linetype='dashed')
            + p9.scale_x_continuous(
                name=' << past days {:^20} future days >>'.format(str(date)),
                breaks=range(-VIEW_LIMIT // 7 * 7,
                             (VIEW_LIMIT // 7 * 7) + 1, 7),
                minor_breaks=6)
            # Authors ordered by their most recent activity.
            + p9.scale_y_discrete(
                name='',
                limits=activities.sort_values(
                    'day', ascending=False)['author'].unique())
            + p9.scale_alpha_discrete(range=(0.2, 1), name=' ')
            + p9.coord_cartesian(xlim=(-VIEW_LIMIT, VIEW_LIMIT))
            + p9.theme_matplotlib()
            + p9.theme(
                figure_size=(6, 4 * activities['author'].nunique() / 15))
        )

        fig = chart.draw()
        fig.savefig(filepath, bbox_inches='tight')

    print('Plot exported to {}'.format(filepath))


def cli():
    """Command-line entry point of GAP (Git Activity Predictor).

    Parses `sys.argv`, collects commit dates per author from the given
    repositories, predicts each author's next activity and either prints the
    result (text, csv or json) or exports it as a plot.

    Raises:
        SystemExit: with status 1 when a repository cannot be opened; with
            the default (success) status when no author qualifies, since
            there is then simply nothing to report.
    """
    args = _cli_build_parser().parse_args()

    # Default plot location
    if args.plot is True:
        args.plot = str(args.date) + '.pdf'

    # Default to text if no other option is provided
    if not args.csv and not args.json and not args.plot:
        args.text = True

    mapping = _cli_load_mapping(args.mapping)
    activity = _cli_collect_activity(args.paths, args.branches, mapping)
    data = _cli_predict(activity, args.date, args.obs, args.probs, args.limit)

    if not data:
        print(('No author has {} observations and was active at least once '
               'during the last {} days').format(args.obs, args.limit))
        sys.exit()

    if args.plot:
        _cli_export_plot(data, args.date, args.probs, args.plot)
    else:
        _cli_print_table(data, args)