def scatterplot(cls, df):
    Utils.check_and_make_dir("Figures/Scatterplots")
    df = df[(df['index'] != 'Overall') & (df['index'] != 'No ROI')]  # Remove No ROI and Overall rows
    df = df.groupby([config.table_cols, config.table_rows]).apply(
        lambda x: x.sort_values(['Mean']))  # Group by parameters and sort
    df = df.reset_index(drop=True)  # Reset index to remove grouping

    scatterplots = ['roi_ordered', 'stat_ordered']
    if config.table_row_order == 'roi':
        scatterplots.remove('stat_ordered')
    elif config.table_row_order == 'statorder':
        scatterplots.remove('roi_ordered')

    for scatterplot in scatterplots:
        if config.verbose:
            print(f"Saving {scatterplot} scatterplot!")

        if scatterplot == 'roi_ordered':
            roi_ord = pd.Categorical(
                df['index'],
                categories=df['index'].unique())  # Order rows based on first facet
        else:
            roi_ord = pd.Categorical(
                df.groupby(['MB', 'SENSE']).cumcount())  # Order each facet individually

        figure_table = (
            pltn.ggplot(df, pltn.aes(x="Mean", y=roi_ord))
            + pltn.geom_point(na_rm=True, size=1)
            + pltn.geom_errorbarh(
                pltn.aes(xmin="Mean-Conf_Int_95", xmax="Mean+Conf_Int_95"),
                na_rm=True, height=None)
            + pltn.xlim(0, None)
            + pltn.scale_y_discrete(labels=[])
            + pltn.ylab(config.table_y_label)
            + pltn.xlab(config.table_x_label)
            + pltn.facet_grid(
                '{rows}~{cols}'.format(rows=config.table_rows,
                                       cols=config.table_cols),
                drop=True, labeller="label_both")
            + pltn.theme_538()  # Set theme
            + pltn.theme(
                panel_grid_major_y=pltn.themes.element_line(alpha=0),
                panel_grid_major_x=pltn.themes.element_line(alpha=1),
                panel_background=pltn.element_rect(fill="gray", alpha=0.1),
                dpi=config.plot_dpi))

        figure_table.save(
            f"Figures/Scatterplots/{scatterplot}_scatterplot.png",
            height=config.plot_scale,
            width=config.plot_scale * 3,
            verbose=False, limitsize=False)
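# A minimal usage sketch for scatterplot(), assuming the method lives on a
# class (the `cls` parameter suggests a classmethod; `Figures` below is a
# hypothetical name) and that config.table_cols/config.table_rows name the
# 'MB' and 'SENSE' columns the function also hardcodes. The data frame must
# carry the columns the function reads: 'index', 'Mean' and 'Conf_Int_95'.
# The values are illustrative, not from the source.
#
# import pandas as pd
# summary = pd.DataFrame({
#     'index': ['ROI_A', 'ROI_B', 'Overall'],   # 'Overall'/'No ROI' rows are dropped
#     'Mean': [1.2, 0.8, 1.0],
#     'Conf_Int_95': [0.1, 0.2, 0.15],
#     'MB': [1, 2, 1],
#     'SENSE': [1.0, 1.5, 1.0],
# })
# Figures.scatterplot(summary)  # writes Figures/Scatterplots/*_scatterplot.png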
def setup_heatmap0(df: pd.DataFrame, format_string, axis_text):
    # https://stackoverflow.com/a/62161556/819272
    # Plotnine does not support changing the position of any axis.
    return (p9.ggplot(df, p9.aes(y='row', x='col'))
            + p9.coord_equal()
            + p9.geom_tile(p9.aes(fill='scale'))
            + p9.geom_text(p9.aes(label='value'),
                           format_string=format_string, size=7)
            + p9.scale_y_discrete(drop=False)
            + p9.scale_x_discrete(drop=False)
            + p9.scale_fill_gradientn(colors=['#63BE7B', '#FFEB84', '#F8696B'],
                                      na_value='#CCCCCC', guide=False)
            + p9.theme(axis_text=p9.element_blank() if not axis_text
                       else p9.element_text(face='bold'),
                       axis_ticks=p9.element_blank(),
                       axis_title=p9.element_blank(),
                       panel_grid=p9.element_blank()))
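# A usage sketch for setup_heatmap0(). It expects a long-form frame with
# 'row' and 'col' (discrete), 'value' (cell labels) and 'scale' (fill, here
# normalized to [0, 1]); '{:.2f}' is a plausible format_string. The data
# below is made up for illustration.
import itertools

import pandas as pd

cells = [(r, c, r * c) for r, c in itertools.product(range(3), range(4))]
heat_df = pd.DataFrame(cells, columns=['row', 'col', 'value'])
heat_df['scale'] = heat_df['value'] / heat_df['value'].max()
heat_df['row'] = heat_df['row'].astype(str)  # discrete scales expect categorical axes
heat_df['col'] = heat_df['col'].astype(str)

heatmap = setup_heatmap0(heat_df, format_string='{:.2f}', axis_text=True)
heatmap.save('heatmap.png', dpi=150)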
def scatter_plot(df,
                 x,
                 y,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 base_size=10,
                 figure_size=(6, 3),
                 **kwargs):
    '''
    Aggregates data in df and plots as a scatter plot chart.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str
        quoted expression to be plotted on the y axis
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size
    **kwargs
        additional kwargs passed to geom_point

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=True)
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # add group_x column
    if group is not None:
        gdata['group_x'] = gdata['group'].astype('str') + '_' + gdata['x'].astype(str)

    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_point(p9.aes(x="x", y="y"),
                           colour=ez_colors(1)[0],
                           **kwargs)
    else:
        g += p9.geom_point(p9.aes(x="x", y="y",
                                  group="factor(group)",
                                  color="factor(group)"),
                           **kwargs)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    if g.column_is_timestamp('y'):
        g += p9.scale_y_datetime()
    elif g.column_is_categorical('y'):
        g += p9.scale_y_discrete()
    else:
        g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))

    return g
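# A usage sketch for scatter_plot() on a toy frame. x, y and group are quoted
# column expressions evaluated against df; extra kwargs (alpha here) go to
# geom_point. The data and file name are illustrative; save() is assumed to be
# available since EZPlot wraps a plotnine ggplot.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'height': np.random.normal(170, 10, 200),
    'weight': np.random.normal(70, 8, 200),
    'sex': np.random.choice(['F', 'M'], 200),
})
g = scatter_plot(toy, x='height', y='weight', group='sex', alpha=0.5)
g.save('scatter.png')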
def cli():
    parser = argparse.ArgumentParser(description='GAP - Git Activity Predictor')
    parser.add_argument('paths', metavar='PATH', type=str, nargs='*',
                        default=['.'],
                        help='Paths to one or more git repositories')
    parser.add_argument('--date',
                        type=lambda d: dateutil.parser.parse(d).date(),
                        required=False,
                        default=datetime.date.today(),
                        help='Date used for predictions (default to current date)')
    parser.add_argument('--obs', type=int, required=False, default=20,
                        help='Number of observations to consider')
    parser.add_argument('--probs', metavar='PROB', type=float, nargs='*',
                        required=False,
                        default=[0.5, 0.6, 0.7, 0.8, 0.9],
                        help='Probabilities to output, strictly in [0,1].')
    parser.add_argument('--limit', type=int, required=False, default=30,
                        help='Limit contributors to the ones that were active '
                             'at least once during the last x days (default 30)')
    parser.add_argument('--mapping', type=str, nargs='?',
                        help='Mapping file to merge identities. This file must be '
                             'a csv file where each line contains two values: the '
                             'name to be merged, and the corresponding identity. '
                             'Use "IGNORE" as identity to ignore specific names.')
    parser.add_argument('--branches', metavar='BRANCH', type=str, nargs='*',
                        default=list(),
                        help='Git branches to analyse (default to all).')
    parser.add_argument('--as-dates', dest='as_dates', action='store_true',
                        help='Express predictions using dates instead of time '
                             'differences in days')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--text', action='store_true',
                       help='Print results as text.')
    group.add_argument('--csv', action='store_true',
                       help='Print results as csv.')
    group.add_argument('--json', action='store_true',
                       help='Print results as json.')
    group.add_argument('--plot', nargs='?', const=True,
                       help='Export results to a plot. Filepath can be '
                            'optionally specified.')

    args = parser.parse_args()

    # Default plot location
    if args.plot is True:
        args.plot = str(args.date) + '.pdf'

    # Default to text if no other option is provided
    if not args.csv and not args.json and not args.plot:
        args.text = True

    # Identity mapping
    if args.mapping:
        d = pandas.read_csv(args.mapping, names=['source', 'target'])
        mapping = {r.source: r.target for r in d.itertuples()}
    else:
        mapping = {}

    raw_data = dict()  # author -> dates of activity

    # Get data from git
    for path in args.paths:
        try:
            repo = git.Repo(path)
        except Exception as e:  # Must be refined
            print('Unable to access repository {} ({}:{})'.format(
                path, e.__class__.__name__, e))
            sys.exit()

        # Default branches
        if len(args.branches) == 0:
            commits = repo.iter_commits('--all')
        else:
            commits = repo.iter_commits(' '.join(args.branches))

        for commit in commits:
            try:
                author = commit.author.name
                identity = mapping.get(author, author)
                if author.lower() != 'ignore' and identity.lower() == 'ignore':
                    continue
                date = datetime.date.fromtimestamp(commit.authored_date)
                raw_data.setdefault(identity, []).append(date)
            except Exception as e:
                print('Unable to read commit ({}: {}): {}'.format(
                    e.__class__.__name__, e, commit))

    # Compute durations and apply model
    data = []  # (author, past activities, predicted durations)
    for author, commits in raw_data.items():
        commits = sorted([e for e in commits if e <= args.date])
        durations = dates_to_duration(commits, window_size=args.obs)

        if len(durations) >= args.obs:
            # Currently implemented with no censor
            surv = SurvfuncRight(durations, [1] * len(durations))
            predictions = [surv.quantile(p) for p in args.probs]
            last_day = commits[-1]
            if last_day >= args.date - datetime.timedelta(args.limit):
                data.append((author, commits, predictions))

    # Prepare dataframe
    df = pandas.DataFrame(index=set([a for a, c, p in data]),
                          columns=['last'] + args.probs)
    if len(df) == 0:
        print('No author has {} observations and was active at least once '
              'during the last {} days'.format(args.obs, args.limit))
        sys.exit()
    df.index.name = 'author'

    if not args.plot:
        for author, commits, predictions in data:
            last = commits[-1]
            if args.as_dates:
                df.at[author, 'last'] = last
            else:
                df.at[author, 'last'] = (last - args.date).days

            for prob, p in zip(args.probs, predictions):
                if args.as_dates:
                    df.at[author, prob] = last + datetime.timedelta(days=int(p))
                else:
                    df.at[author, prob] = (
                        last + datetime.timedelta(days=int(p)) - args.date).days

        df = df.sort_values(['last'] + args.probs,
                            ascending=[False] + [True] * len(args.probs))
        df = df.astype(str)

        if args.text:
            pandas.set_option('expand_frame_repr', False)
            pandas.set_option('display.max_columns', 999)
            print(df)
        elif args.csv:
            print(df.to_csv())
        elif args.json:
            print(df.to_json(orient='index'))
    else:
        # Because of plotnine's way of initializing matplotlib
        import warnings
        warnings.filterwarnings("ignore")

        VIEW_LIMIT = 28

        activities = []  # List of (author, day) where day is a delta w.r.t. given date
        forecasts = []  # List of (author, from_day, to_day, p) where probability p
        # applies between from_day and to_day (delta w.r.t. given date)

        for author, commits, predictions in data:
            last = (commits[-1] - args.date).days
            for e in commits:
                activities.append((author, (e - args.date).days))

            previous = previous_previous = 0
            for d, p in zip(predictions, args.probs):
                if d > previous:
                    forecasts.append((author, last + previous, last + d, p))
                    previous_previous = previous
                    previous = d
                else:
                    forecasts.append((author, last + previous_previous, last + d, p))

        activities = pandas.DataFrame(columns=['author', 'day'], data=activities)
        forecasts = pandas.DataFrame(columns=['author', 'fromd', 'tod', 'p'],
                                     data=forecasts)

        plot = (p9.ggplot(p9.aes(y='author'))
                + p9.geom_segment(
                    p9.aes('day - 0.5', 'author', xend='day + 0.5', yend='author'),
                    data=activities,
                    size=4,
                    color='orange')
                + p9.geom_segment(
                    p9.aes('fromd + 0.5', 'author', xend='tod + 0.5',
                           yend='author', alpha='factor(p)'),
                    data=forecasts.sort_values('p').drop_duplicates(
                        ['author', 'fromd', 'tod'], keep='last'),
                    size=4,
                    color='steelblue')
                + p9.geom_vline(xintercept=0, color='r', alpha=0.5, linetype='dashed')
                + p9.scale_x_continuous(
                    name=' << past days {:^20} future days >>'.format(str(args.date)),
                    breaks=range(-VIEW_LIMIT // 7 * 7, (VIEW_LIMIT // 7 * 7) + 1, 7),
                    minor_breaks=6)
                + p9.scale_y_discrete(
                    name='',
                    limits=activities.sort_values(
                        'day', ascending=False)['author'].unique())
                + p9.scale_alpha_discrete(range=(0.2, 1), name=' ')
                + p9.coord_cartesian(xlim=(-VIEW_LIMIT, VIEW_LIMIT))
                + p9.theme_matplotlib()
                + p9.theme(figure_size=(6, 4 * activities['author'].nunique() / 15)))

        fig = plot.draw()
        fig.savefig(args.plot, bbox_inches='tight')
        print('Plot exported to {}'.format(args.plot))
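# cli() calls dates_to_duration(), which is not shown in this section. A
# minimal sketch of what it plausibly does, assuming it turns a sorted list of
# activity dates into the gaps (in days) between consecutive distinct active
# days, keeping only the last window_size observations; the real helper may
# differ.
def dates_to_duration(dates, window_size=20):
    durations = []
    for previous, current in zip(dates[:-1], dates[1:]):
        delta = (current - previous).days
        if delta > 0:  # collapse same-day commits into one active day
            durations.append(delta)
    return durations[-window_size:]

# Typical invocation (illustrative paths and options):
#   gap path/to/repo --date 2020-01-01 --probs 0.5 0.9 --plot activity.pdf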
def hist_plot(df,
              x,
              y=None,
              group=None,
              facet_x=None,
              facet_y=None,
              w='1',
              bins=21,
              bin_width=None,
              position='stack',
              normalize=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Plot a 1-d or 2-d histogram

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str
        quoted expression to be plotted on the y axis. If this is specified
        the histogram will be 2-d.
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    w : str
        quoted expression representing histogram weights (default is 1)
    bins : int or tuple
        number of bins to be used
    bin_width : float or tuple
        bin width to be used
    position : str
        if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
        normalize histogram counts
    sort_groups : bool
        sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_width should be defined")
        raise ValueError("Either bins or bin_width should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one between bins or bin_width should be defined")
        raise ValueError("Only one between bins or bin_width should be defined")

    if (y is not None) and (group is not None):
        log.error("y and group cannot be requested at the same time")
        raise ValueError("y and group cannot be requested at the same time")

    if y is None:
        bins = (bins, bins)
        bin_width = (bin_width, bin_width)
    else:
        if type(bins) not in [tuple, list]:
            bins = (bins, bins)
        if type(bin_width) not in [tuple, list]:
            bin_width = (bin_width, bin_width)

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'y', 'group', 'facet_x', 'facet_y'],
                          [x, y, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c: c for c in tmp_df.columns
                  if c in ['x', 'y', 'group', 'facet_x', 'facet_y']}
    non_xy_groups = [g for g in new_groups.keys() if g not in ['x', 'y']]
    new_variables = {'w': 'w'}

    # bin data (if necessary)
    if tmp_df['x'].dtypes != np.dtype('O'):
        tmp_df['x'], bins_x, bin_width_x = bin_data(tmp_df['x'], bins[0], bin_width[0])
    else:
        bin_width_x = 1
    if y is not None:
        if tmp_df['y'].dtypes != np.dtype('O'):
            tmp_df['y'], bins_y, bin_width_y = bin_data(tmp_df['y'], bins[1], bin_width[1])
        else:
            bin_width_y = 1
    else:
        bin_width_y = 1

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, 'sum', fill_groups=True)
    gdata.fillna(0, inplace=True)
    gdata = gdata[[c for c in ['x', 'y', 'w', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # normalize
    if normalize:
        if len(non_xy_groups) == 0:
            gdata['w'] = gdata['w'] / (gdata['w'].sum() * bin_width_x * bin_width_y)
        else:
            gdata['w'] = gdata.groupby(non_xy_groups)['w'].apply(
                lambda x: x / (x.sum() * bin_width_x * bin_width_y))

    # start plotting
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if (group is not None) and sort_groups:
        if g.column_is_categorical('x'):
            g.sort_group('x', 'w', ascending=False)
        g.sort_group('group', 'w')
        g.sort_group('facet_x', 'w', ascending=False)
        g.sort_group('facet_y', 'w', ascending=False)
        if groups:
            colors = np.flip(ez_colors(g.n_groups('group')))
    elif group is not None:
        colors = ez_colors(g.n_groups('group'))

    if y is None:
        # set groups
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             stat='identity',
                             colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             colour=None,
                             stat='identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=colors)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab('Counts')

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text=names['group'], size=base_size))

        if sort_groups:
            g += p9.guides(fill=p9.guide_legend(reverse=True))

    else:
        g += p9.geom_tile(p9.aes(x="x", y="y", fill='w'),
                          stat='identity',
                          colour=None)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        if g.column_is_categorical('y'):
            g += p9.scale_y_discrete()
        else:
            g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab(names['y'])

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text='Counts', size=base_size))

    return g
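# A usage sketch for hist_plot(): a grouped, normalized 1-d histogram and a
# 2-d histogram from the same toy frame. Data and parameter choices are
# illustrative.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'a': np.random.normal(size=500),
    'b': np.random.normal(size=500),
    'grp': np.random.choice(['u', 'v'], 500),
})
g1 = hist_plot(toy, x='a', group='grp', bins=30, position='overlay', normalize=True)
g2 = hist_plot(toy, x='a', y='b', bins=(20, 20))  # 2-d: counts shown as tiles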
def plot_pointgraph(
    plot_df,
    x_axis_label,
    left_arrow_label,
    right_arrow_label,
    left_arrow_start=-0.5,
    left_arrow_height=38.5,
    right_arrow_start=0.5,
    right_arrow_height=1.5,
    arrow_length=2,
    left_arrow_label_x=-1.5,
    left_arrow_label_y=-1.5,
    right_arrow_label_x=-1.5,
    right_arrow_label_y=-1.5,
    limits=(-3, 3),
):
    """
    Plot an errorbar graph of each token's odds ratio. The main idea of this
    graph is to show which corpus a token is enriched in.

    Args:
        plot_df - the data frame to plot
        x_axis_label - the label of the x axis
        left_arrow_label - the label for the left arrow
        right_arrow_label - the label for the right arrow
        left_arrow_start - the start of the left arrow to be plotted
        left_arrow_height - the height at which the left arrow needs to be plotted
        right_arrow_start - the start of the right arrow to be plotted
        right_arrow_height - the height at which the right arrow needs to be plotted
        arrow_length - the length of the arrows
        left_arrow_label_x - the x axis position for the label of the left arrow
        left_arrow_label_y - the y axis position for the label of the left arrow
        right_arrow_label_x - the x axis position for the label of the right arrow
        right_arrow_label_y - the y axis position for the label of the right arrow
        limits - the x axis limits, default (-3, 3)
    """
    graph = (
        p9.ggplot(
            plot_df.assign(lemma=lambda x: pd.Categorical(x.lemma.tolist())),
            p9.aes(
                y="lemma",
                xmin="lower_odds",
                x="odds_ratio",
                xmax="upper_odds",
                yend="lemma",
            ),
        )
        + p9.geom_errorbarh(color="#253494")
        + p9.scale_y_discrete(limits=(
            plot_df.sort_values("odds_ratio", ascending=True).lemma.tolist()))
        + p9.scale_x_continuous(limits=limits)
        + p9.geom_vline(p9.aes(xintercept=0), linetype="--", color="grey")
        + p9.annotate(
            "segment",
            x=left_arrow_start,
            xend=left_arrow_start - arrow_length,
            y=left_arrow_height,
            yend=left_arrow_height,
            colour="black",
            size=0.5,
            alpha=1,
            arrow=p9.arrow(length=0.1),
        )
        + p9.annotate(
            "text",
            label=left_arrow_label,
            x=left_arrow_label_x,
            y=left_arrow_label_y,
            size=12,
            alpha=0.7,
        )
        + p9.annotate(
            "segment",
            x=right_arrow_start,
            xend=right_arrow_start + arrow_length,
            y=right_arrow_height,
            yend=right_arrow_height,
            colour="black",
            size=0.5,
            alpha=1,
            arrow=p9.arrow(length=0.1),
        )
        + p9.annotate(
            "text",
            label=right_arrow_label,
            x=right_arrow_label_x,
            y=right_arrow_label_y,
            size=12,
            alpha=0.7,
        )
        + p9.theme_seaborn(
            context="paper", style="ticks", font_scale=1, font="Arial")
        + p9.theme(
            figure_size=(11, 8.5),
            panel_grid_minor=p9.element_blank(),
            text=p9.element_text(size=12),
        )
        + p9.labs(y=None, x=x_axis_label)
    )
    return graph
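# A usage sketch for plot_pointgraph(). It expects log2 odds ratios with
# confidence bounds, one row per lemma; the values, arrow positions, and the
# left-hand label below are made up for illustration (only "bioRxiv Enriched"
# appears in the source cells).
import pandas as pd

odds_df = pd.DataFrame({
    'lemma': ['gene', 'cell', 'model'],
    'odds_ratio': [1.2, -0.8, 0.3],
    'lower_odds': [0.9, -1.1, 0.1],
    'upper_odds': [1.5, -0.5, 0.5],
})
fig = plot_pointgraph(
    odds_df,
    x_axis_label='Log2 Odds Ratio',
    left_arrow_label='Other Corpus Enriched',  # hypothetical label
    right_arrow_label='bioRxiv Enriched',
    left_arrow_height=2.5,
    right_arrow_height=1.5,
    left_arrow_label_x=-1.5, left_arrow_label_y=3.0,
    right_arrow_label_x=1.5, right_arrow_label_y=0.5,
)
fig.save('token_odds.png')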
# NOTE: this cell begins mid-statement in the source; the keyword arguments
# below complete an .assign(...) call (as in the next cell) that converts the
# odds-ratio columns to log2 scale.
    odds_ratio=lambda x: x.odds_ratio.apply(lambda x: np.log2(x)),
    lower_odds=lambda x: x.lower_odds.apply(lambda x: np.log2(x)),
    upper_odds=lambda x: x.upper_odds.apply(lambda x: np.log2(x)),
))
plot_df.head()

g = (p9.ggplot(
        plot_df.assign(lemma=lambda x: pd.Categorical(x.lemma.tolist())),
        p9.aes(
            y="lemma",
            xmin="lower_odds",
            x="odds_ratio",
            xmax="upper_odds",
            yend="lemma",
        ),
    )
    + p9.geom_errorbarh(color="#253494")
    + p9.scale_y_discrete(limits=(
        plot_df.sort_values("odds_ratio", ascending=True).lemma.tolist()))
    + p9.scale_x_continuous(limits=(-3, 3))
    + p9.geom_vline(p9.aes(xintercept=0), linetype="--", color="grey")
    + p9.annotate(
        "segment",
        x=0.5,
        xend=2.5,
        y=1.5,
        yend=1.5,
        colour="black",
        size=0.5,
        alpha=1,
        arrow=p9.arrow(length=0.1),
    )
    + p9.annotate(
        "text", label="bioRxiv Enriched", x=1.5, y=2.5, size=18, alpha=0.7)
    + p9.annotate(
        # (the remaining annotations are truncated in the source)
        ...
    ))
plot_df = (full_plot_df.sort_values("odds_ratio", ascending=False)
           .head(subset)
           .append(full_plot_df.sort_values("odds_ratio", ascending=False)
                   .iloc[:-2]
                   .tail(subset))
           .replace("rna", "RNA")
           .assign(
               odds_ratio=lambda x: x.odds_ratio.apply(lambda x: np.log2(x)),
               lower_odds=lambda x: x.lower_odds.apply(lambda x: np.log2(x)),
               upper_odds=lambda x: x.upper_odds.apply(lambda x: np.log2(x)),
           ))
plot_df.head()

g = (p9.ggplot(
        plot_df,
        p9.aes(y="lemma", x="lower_odds", xend="upper_odds", yend="lemma"))
    + p9.geom_segment(color="#253494", size=6, alpha=0.7)
    + p9.scale_y_discrete(limits=(
        plot_df.sort_values("odds_ratio", ascending=True).lemma.tolist()))
    + p9.scale_x_continuous(limits=(-3, 3))
    + p9.geom_vline(p9.aes(xintercept=0), linetype="--", color="grey")
    + p9.annotate(
        "segment",
        x=0.5,
        xend=2.5,
        y=1.5,
        yend=1.5,
        colour="black",
        size=0.5,
        alpha=1,
        arrow=p9.arrow(length=0.1),
    )
    + p9.annotate(
        "text", label="bioRxiv Enriched", x=1.5, y=2.5, size=14, alpha=0.7)
    + p9.annotate(
        # (the remaining annotations are truncated in the source)
        ...
    ))