import numpy as np
import plotnine as gg


def limits(x, y=None, xbreaks=None, ybreaks=None):
    if y is None:
        y = x
    x0, x1 = x
    y0, y1 = y
    if xbreaks is None:
        xbreaks = np.linspace(x0, x1, x1 - x0 + 1)
    if ybreaks is None:
        ybreaks = np.linspace(y0, y1, y1 - y0 + 1)
    # We want these plots to continue to the top and left.
    return [
        gg.coord_cartesian(xlim=x, ylim=y),
        gg.scale_x_continuous(limits=(x0, None), breaks=xbreaks),
        gg.scale_y_continuous(limits=(y0, None), breaks=ybreaks)
    ]
    # NOTE: dead code kept from the original; never reached because of the
    # return above.
    # return [
    #     gg.scale_x_continuous(limits=x, breaks=xbreaks),
    #     gg.scale_y_continuous(limits=y, breaks=ybreaks)
    # ]
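# Usage sketch (hedged): `limits` returns a list of plotnine components that
# can be added to a plot one by one. The dataframe and aesthetics below are
# placeholders, not from the original code.
import pandas as pd

demo = pd.DataFrame({'x': range(11), 'y': range(11)})
p = gg.ggplot(demo, gg.aes('x', 'y')) + gg.geom_point()
for component in limits((0, 10), (0, 10)):
    p = p + component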
def test_text_aesthetics():
    p = (ggplot(df, aes(y='y', label='label'))
         + geom_text(aes('x', label='label'), size=15, ha='left')
         + geom_text(aes('x+1', angle='angle'), size=15, va='top',
                     show_legend=False)
         + geom_text(aes('x+2', label='label', alpha='z'), size=15,
                     show_legend=False)
         + geom_text(aes('x+3', color='factor(z)'), size=15,
                     show_legend=False)
         + geom_text(aes('x+5', size='z'), ha='right', show_legend=False)
         + scale_size_continuous(range=(12, 30))
         + scale_y_continuous(limits=(-0.5, n-0.5)))

    assert p == 'text_aesthetics'
"R": "R", "java": "Java", "scala": "Scala", "C": "C", "sas": "SAS" } skills_summary_lang = skills_summary_df[skills_summary_df.attribute.isin( languages)] skills_summary_lang = skills_summary_lang.replace(to_replace=lang_clean) skills_summary_lang = sort_df(skills_summary_lang, var_col="attribute") lang_plot = ( p9.ggplot(skills_summary_lang, p9.aes('attribute', 'value', fill='type', show_legend=False)) + p9.geom_col() + p9.coord_flip() + p9.scale_y_continuous(expand=[0, 0]) + p9.labs(y="Frequency", x="Language", fill="") + p9.scale_fill_brewer(palette="Blues") + p9.facet_wrap('~type')) lang_plot.save(filename='figs/lang_plot.png', height=5, width=5, units='in', dpi=1000) lang_plot #Software programs = ["tableau", "docker", "bigquery", "jira", "spark", "hadoop"] prog_clean = { "tableau": "Tableau", "docker": "Docker",
def plot_bar(data, nuclstr, column='value', factor=None, ymin=None, ymax=None,
             stat='identity', dpi=300, features=None, feature_types=['all'],
             add_features=[], funcgroups=None,
             shading_modes=['charge_functional'], usd=False,
             right_overhang_fix=None, debug=False, startnumber=1,
             cropseq=(0, None), aspect_ratio=None, reverse_seq=False,
             double_seq=False, transparent=True, fill_params=None,
             bar_position='stack', title=None):
    """
    A wrapper function to make a plot of data with bars along the sequence.
    Input should be a dataframe with resid and segid columns and a 'value'.
    This one is inspired by seqplot/seqplot/pdb_plot.py
    """
    segid = data['segid'].values[0]

    if title is None:
        title = "Segid: %s, Type: %s" % (segid,
                                         nuclstr.components[segid]['type'])

    # NOTE: the original condition `entity is 'DNA' or 'histone' or 'protein'`
    # was always true; the presumed intent is a protein alphabet for
    # protein-like entities and a DNA alphabet otherwise.
    seq = Seq(str(nuclstr.seqs[segid]['fullseq']),
              generic_protein
              if nuclstr.components[segid]['entity'] in ('histone', 'protein')
              else generic_dna)
    msar = MultipleSeqAlignment([
        SeqRecord(seq=seq,
                  id=nuclstr.components[segid]['type'] + ':' + segid,
                  name=nuclstr.components[segid]['type'] + ':' + segid)])

    if reverse_seq:
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq = msar[0].seq[::-1]

    if double_seq:
        msar.add_sequence('reverse', str(msar[0].seq[::-1]))

    msar = msar[:, cropseq[0]:cropseq[1]]

    # print("Seq to plot:", msar)

    # We need to get the starting residue; currently for DNA chains only
    # cifseq gets it correctly.
    resid_start = nuclstr.seqs[segid]['resid_start']
    logger.debug("Starting resid %s", resid_start)

    overhang = nuclstr.seqs[segid]['overhangL']
    datafixed = data.copy()
    datafixed.loc[:, 'resid'] = (datafixed.loc[:, 'resid'] - resid_start
                                 + overhang + 1 - cropseq[0])

    sl = len(msar[0].seq)

    # fn = shade.seqfeat2shadefeat(msar, feature_types=feature_types,
    #                              force_feature_pos='bottom', debug=debug)
    if features is None:
        fn = nuclstr.shading_features[segid]
    else:
        fn = features
    fn2 = []
    for i in fn:
        if (i['style'] in feature_types) or ('all' in feature_types):
            fn2.append(i)
    fn2.extend(add_features)

    if usd:
        ruler = 'top'
    else:
        ruler = None

    shaded = ipyshade.shadedmsa4plot(
        msar, features=fn2, shading_modes=shading_modes, debug=debug,
        startnumber=startnumber,
        setends=[startnumber - 2, sl + startnumber + 2],
        funcgroups=funcgroups, ruler=ruler, density=200)

    # If sl % 10 == 0 we will have a ruler number hanging beyond the sequence
    # image, and we need to correct for that.
    if right_overhang_fix is None:
        if sl % 10 == 0:
            if sl < 100:
                rof = 0.1
            else:
                rof = 0.5
        else:
            rof = 0
    else:
        rof = right_overhang_fix

    if aspect_ratio is not None:
        ar = aspect_ratio
    else:
        ar = 0.2 * 100. / sl

    # print(datafixed)
    plot = (ggplot(data=datafixed, mapping=aes(x='resid', y=column))
            # + geom_point(size=0.1)
            # + geom_bar(stat='identity', width=0.5, mapping=aes(fill=factor))
            + scale_x_continuous(limits=(0.5, sl + 0.5 + rof),
                                 expand=(0, 0.2), name='', breaks=[])
            # + scale_y_continuous(breaks=[0, 0.5, 1.0])
            + theme_light()
            + theme(aspect_ratio=ar, dpi=dpi, plot_margin=0,
                    text=element_text(size=6), legend_key_size=5,
                    legend_position='bottom', legend_direction='horizontal'))
    # + facet_wrap('~ segid', dir='v') + guides(color=guide_legend(ncol=10))

    if factor is None:
        plot = plot + geom_bar(stat=stat, width=0.5)
    else:
        plot = plot + geom_bar(stat=stat, width=0.5, mapping=aes(fill=factor),
                               position=bar_position)

    if fill_params is not None:
        plot = plot + scale_fill_manual(**fill_params)

    if not usd:
        if ymax is not None:
            plot = plot + scale_y_continuous(limits=(None, ymax))
    else:
        if ymin is not None:
            plot = plot + scale_y_continuous(limits=(ymin, None))

    if ymax is None:
        ymax = data[column].max()
    if ymin is None:
        ymin = data[column].min()

    # print(ymax)
    plot = plot + geom_seq_x(seqimg=shaded.img, xlim=(1, sl + rof),
                             ylim=(ymin, ymax), usd=usd, aspect_ratio=ar,
                             transparent=transparent) + ggtitle(title)

    return plot
def quick_color_check(target_matrix, source_matrix, num_chips):
    """ Quickly plot target matrix values against source matrix values to
    determine over saturated color chips or other issues.

    Inputs:
    source_matrix = a 22x4 matrix containing the average red value, average
                    green value, and average blue value for each color chip
                    of the source image
    target_matrix = a 22x4 matrix containing the average red value, average
                    green value, and average blue value for each color chip
                    of the target image
    num_chips     = number of color card chips included in the matrices (integer)

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, \
        facet_grid, geom_label, scale_x_continuous, scale_y_continuous, \
        scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = []
    blue = []
    green = []
    for i in range(num_chips):
        red.append('red')
        blue.append('blue')
        green.append('green')

    # Make a column of chip numbers
    chip = np.arange(0, num_chips).reshape((num_chips, 1))
    chips = np.row_stack((chip, chip, chip))

    # Combine info
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.row_stack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0],
                            'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe
    dataset['chip'] = chips
    dataset = dataset.astype({'color': str, 'chip': str,
                              'target': float, 'source': float})

    # Make the plot
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Reset debug
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
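# Usage sketch (hedged): synthetic 22x4 color matrices stand in for the real
# PlantCV color-card outputs; only the shapes and column layout matter here.
# Module-level `np`, `os`, and `params` are assumed from the PlantCV context.
import numpy as np

rng = np.random.default_rng(0)
target = np.column_stack((np.arange(22), rng.uniform(0, 255, (22, 3))))
source = np.column_stack((np.arange(22), rng.uniform(0, 255, (22, 3))))
quick_color_check(target, source, num_chips=22)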
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False,
                                         columns=False):
    if expo:
        if (os.path.exists('data/external/all_human_gameplay.json')
                and not self.no_humans):
            with open('data/external/all_human_gameplay.json') as f:
                all_gameplay = json.load(f)
                frames = []
                for event, name in [('parents', 'Dilettante'),
                                    ('maryland', 'Expert'),
                                    ('live', 'National')]:
                    if self.merge_humans:
                        name = 'Human'
                    gameplay = all_gameplay[event]
                    if event != 'live':
                        control_correct_positions = gameplay['control_correct_positions']
                        control_wrong_positions = gameplay['control_wrong_positions']
                        control_positions = (control_correct_positions
                                             + control_wrong_positions)
                        control_positions = np.array(control_positions)
                        control_result = np.array(
                            len(control_correct_positions) * [1]
                            + len(control_wrong_positions) * [0])
                        argsort_control = np.argsort(control_positions)
                        control_x = control_positions[argsort_control]
                        control_sorted_result = control_result[argsort_control]
                        control_y = (control_sorted_result.cumsum()
                                     / control_sorted_result.shape[0])
                        control_df = pd.DataFrame({
                            'correct': control_y,
                            'char_percent': control_x
                        })
                        control_df['Dataset'] = 'Regular Test'
                        control_df['Guessing_Model'] = f' {name}'
                        frames.append(control_df)

                    adv_correct_positions = gameplay['adv_correct_positions']
                    adv_wrong_positions = gameplay['adv_wrong_positions']
                    adv_positions = adv_correct_positions + adv_wrong_positions
                    # NOTE: the original read `np.array(control_positions)`
                    # here, which discarded the adversarial positions just
                    # computed; corrected to use adv_positions.
                    adv_positions = np.array(adv_positions)
                    adv_result = np.array(
                        len(adv_correct_positions) * [1]
                        + len(adv_wrong_positions) * [0])
                    argsort_adv = np.argsort(adv_positions)
                    adv_x = adv_positions[argsort_adv]
                    adv_sorted_result = adv_result[argsort_adv]
                    adv_y = (adv_sorted_result.cumsum()
                             / adv_sorted_result.shape[0])
                    adv_df = pd.DataFrame({
                        'correct': adv_y,
                        'char_percent': adv_x
                    })
                    adv_df['Dataset'] = 'Round 1 - IR Interface'
                    adv_df['Guessing_Model'] = f' {name}'
                    frames.append(adv_df)

                    if len(gameplay['advneural_correct_positions']) > 0:
                        adv_correct_positions = gameplay['advneural_correct_positions']
                        adv_wrong_positions = gameplay['advneural_wrong_positions']
                        adv_positions = (adv_correct_positions
                                         + adv_wrong_positions)
                        adv_positions = np.array(adv_positions)  # same fix as above
                        adv_result = np.array(
                            len(adv_correct_positions) * [1]
                            + len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = (adv_sorted_result.cumsum()
                                 / adv_sorted_result.shape[0])
                        adv_df = pd.DataFrame({
                            'correct': adv_y,
                            'char_percent': adv_x
                        })
                        adv_df['Dataset'] = 'Round 2 - NN Interface'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                human_df = pd.concat(frames)

        if no_models:
            p = ggplot(human_df) + geom_line()
        else:
            df = self.char_plot_df
            if 1 not in self.rounds:
                df = df[df['Dataset'] != 'Round 1 - IR Interface']
            if 2 not in self.rounds:
                df = df[df['Dataset'] != 'Round 2 - IR Interface']
                df = df[df['Dataset'] != 'Round 2 - NN Interface']
            p = ggplot(df)
            if (os.path.exists('data/external/all_human_gameplay.json')
                    and not self.no_humans):
                eprint('Loading human data')
                p = p + geom_line(data=human_df)

        if columns:
            facet_conf = facet_wrap('Guessing_Model', ncol=1)
        else:
            facet_conf = facet_wrap('Guessing_Model', nrow=1)

        if not no_models:
            if self.mvg_avg_char:
                chart = stat_smooth(method='mavg', se=False,
                                    method_args={'window': 400})
            else:
                chart = stat_summary_bin(fun_data=mean_no_se, bins=20,
                                         shape='.')
        else:
            chart = None

        p = (p + facet_conf
             + aes(x='char_percent', y='correct', color='Dataset'))
        if chart is not None:
            p += chart
        p = (
            p
            + scale_y_continuous(breaks=np.linspace(0, 1, 11))
            + scale_x_continuous(breaks=[0, .5, 1])
            + xlab('Percent of Question Revealed')
            + ylab('Accuracy')
            + theme(
                # legend_position='top',
                legend_box_margin=0,
                legend_title=element_blank(),
                strip_text_x=element_text(margin={
                    't': 6, 'b': 6, 'l': 1, 'r': 5
                }))
            + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF'],
                                 name='Questions'))
        if self.title != '':
            p += ggtitle(self.title)
        return p
    else:
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='Guessing_Model')
            + stat_smooth(method='mavg', se=False,
                          method_args={'window': 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 21)))
def line_plot(df, x, y, group=None, facet_x=None, facet_y=None, aggfun='sum',
              err=None, show_points=False, base_size=10, figure_size=(6, 3)):
    '''
    Aggregates data in df and plots multiple columns as a line chart.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str or list of str
        quoted expression(s) to be plotted on the y axis
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    aggfun : str or fun
        function to be used for aggregating (eg sum, mean, median ...)
    err : str
        quoted expression to be used as error shaded area
    show_points : bool
        show/hide markers
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    if group is not None and isinstance(y, list) and len(y) > 1:
        log.error("groups can be specified only when a single y column is present")
        raise ValueError("groups can be specified only when a single y column is present")

    if err is not None and isinstance(y, list) and len(y) > 1:
        log.error("err can be specified only when a single y column is present")
        raise ValueError("err can be specified only when a single y column is present")

    if isinstance(y, list) and len(y) == 1:
        y = y[0]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = (dataframe.index.name
                      if dataframe.index.name is not None else '')

    if isinstance(y, list):
        ys = []
        for i, var in enumerate(y):
            ys.append('y_{}'.format(i))
            names['y_{}'.format(i)], variables['y_{}'.format(i)] = unname(var)

        # aggregate data
        tmp_gdata = agg_data(dataframe, variables, groups, aggfun,
                             fill_groups=True)
        groups_present = [c for c in ['x', 'facet_x', 'facet_y']
                          if c in tmp_gdata.columns]
        gdata = pd.melt(tmp_gdata, groups_present, var_name='group',
                        value_name='y')
        gdata['group'] = gdata['group'].replace(
            {var: names[var] for var in ys})

        # update values for plotting
        names['y'] = 'Value'
        names['group'] = 'Variable'
        group = 'Variable'
    else:
        names['y'], variables['y'] = unname(y)
        if err is not None:
            names['err'], variables['err'] = unname(err)

        # aggregate data
        gdata = agg_data(dataframe, variables, groups, aggfun,
                         fill_groups=True)

    # reorder columns
    gdata = gdata[[c for c in ['x', 'y', 'err', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    if err is not None:
        gdata['ymax'] = gdata['y'] + gdata['err']
        gdata['ymin'] = gdata['y'] - gdata['err']

    # init plot obj
    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"), group=1,
                          colour=ez_colors(1)[0])
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y"), group=1,
                               colour=ez_colors(1)[0])
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x", ymax="ymax", ymin="ymin"),
                                group=1, fill=ez_colors(1)[0], alpha=0.2)
    else:
        g += p9.geom_line(p9.aes(x="x", y="y", group="factor(group)",
                                 colour="factor(group)"))
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x", ymax="ymax", ymin="ymin",
                                       fill="factor(group)"), alpha=0.2)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))
        g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
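# Usage sketch (hedged): assumes the ezplot-style helpers above (EZPlot,
# unname, agg_data, ez_colors, ez_labels, theme_ez) are importable from the
# surrounding package. The dataframe is illustrative only.
import pandas as pd

sales = pd.DataFrame({'month': [1, 2, 3, 1, 2, 3],
                      'region': ['N', 'N', 'N', 'S', 'S', 'S'],
                      'revenue': [10, 12, 9, 7, 8, 11]})
g = line_plot(sales, x='month', y='revenue', group='region', aggfun='sum')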
def marginal_plot(df, x, y, group=None, facet_x=None, facet_y=None,
                  aggfun='sum', bins=21, use_quantiles=False,
                  label_pos='auto', label_function=ez_labels,
                  sort_groups=True, base_size=10, figure_size=(6, 3)):
    '''
    Bin the data in a df and plot it using lines.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str
        quoted expression to be plotted on the y axis
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    aggfun : str or fun
        function to be used for aggregating (eg sum, mean, median ...)
    bins : int or tuple
        number of bins to be used
    use_quantiles : bool
        bin data using quantiles
    label_pos : str
        Use count label on each point. Choose between None, 'auto' or 'force'
    label_function : callable
        labelling function
    sort_groups : bool
        sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    if label_pos not in [None, 'auto', 'force']:
        log.error("label_pos not recognized")
        raise NotImplementedError("label_pos not recognized")
    elif label_pos == 'auto':
        if bins <= 21 and group is None:
            show_labels = True
        else:
            show_labels = False
    else:
        show_labels = True if label_pos == 'force' else False

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c: c for c in tmp_df.columns
                  if c in ['x', 'group', 'facet_x', 'facet_y']}
    new_variables = {'y': 'y'}

    # bin data
    if use_quantiles:
        quantile_groups = [c for c in tmp_df.columns
                           if c in ['group', 'facet_x', 'facet_y']]
        if len(quantile_groups) > 0:
            tmp_df['x'] = tmp_df.groupby(quantile_groups)['x'].apply(
                lambda x: qbin_data(x, bins))
        else:
            tmp_df['x'] = qbin_data(tmp_df['x'], bins)
    else:
        tmp_df['x'], _, _ = bin_data(tmp_df['x'], bins, None)

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, aggfun,
                     fill_groups=False)

    # reorder columns
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # init plot obj
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"), group=1, colour=colors[0])
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y"), group=1,
                               colour=colors[0])
    else:
        g += p9.geom_line(p9.aes(x="x", y="y", group="factor(group)",
                                 colour="factor(group)"))
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        g += p9.scale_color_manual(values=colors)

    # set labels
    if show_labels:
        groups_to_count = [c for c in tmp_df.columns
                           if c in ['x', 'group', 'facet_x', 'facet_y']]
        tmp_df['counts'] = 1
        top_labels = tmp_df \
            .groupby(groups_to_count)['counts'] \
            .sum() \
            .reset_index()
        top_labels['label'] = label_function(top_labels['counts'])

        # make sure labels and data can be joined
        for c in ['group', 'facet_x', 'facet_y']:
            if c in tmp_df.columns:
                try:
                    top_labels[c] = pd.Categorical(
                        top_labels[c].astype(str),
                        categories=g.data[c].cat.categories,
                        ordered=g.data[c].cat.ordered)
                except:
                    pass

        # return g.data, top_labels
        g.data = pd.merge(g.data, top_labels, on=groups_to_count, how='left')
        g.data['label_pos'] = g.data['y'] + \
            np.sign(g.data['y']) * g.data['y'].abs().max() * 0.02

        g += p9.geom_text(p9.aes(x='x', y='label_pos', label='label'),
                          color="#000000", size=base_size * 0.7,
                          ha='center', va='bottom')

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    return g
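# Usage sketch (hedged): bins a numeric x, aggregates y per bin, and draws a
# line per group. Helper imports are assumed from the surrounding package and
# the dataframe is illustrative only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
d = pd.DataFrame({'score': rng.normal(size=500),
                  'won': rng.integers(0, 2, 500),
                  'side': rng.choice(['a', 'b'], 500)})
g = marginal_plot(d, x='score', y='won', group='side', aggfun='mean', bins=11)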
def variable_histogram(df, x, group=None, facet_y=None, w='1', bins=21,
                       bin_width=None, position='stack', normalize=False,
                       base_size=10, figure_size=(6, 3)):
    '''
    Plot a 1-d histogram

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str or list
        quoted expressions to be plotted on the x axis
    group : str
        quoted expression to be used as group (ie color)
    facet_y : str
        quoted expression to be used as facet
    w : str
        quoted expression representing histogram weights (default is 1)
    bins : int or tuple
        number of bins to be used
    bin_width : float or tuple
        bin width to be used
    position : str
        if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
        normalize histogram counts
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    # TODO: performance improvement
    # TODO: add support for categorical variables in x

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_width should be defined")
        raise ValueError("Either bins or bin_width should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one between bins and bin_width should be defined")
        raise ValueError("Only one between bins and bin_width should be defined")

    if isinstance(x, str):
        x = [x]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['group', 'facet_y'], [group, facet_y]):
        names[label], groups[label] = unname(var)

    xs = []
    for i, var in enumerate(x):
        xs.append('x_{}'.format(i))
        names['x_{}'.format(i)], groups['x_{}'.format(i)] = unname(var)

    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c: c for c in tmp_df.columns
                  if c in ['group', 'facet_y'] + xs}
    non_x_groups = [g for g in new_groups.keys() if g not in xs]

    # bin data (if necessary)
    bins_x = {}
    bin_width_x = {}
    for x in xs:
        if tmp_df[x].dtypes != np.dtype('O'):
            tmp_df[x], bins_x[x], bin_width_x[x] = bin_data(tmp_df[x], bins,
                                                            bin_width)
        else:
            bin_width_x[x] = 1

    # aggregate data and reorder columns
    df_ls = []
    for x in xs:
        # aggregate data
        groups = {g: g for g in non_x_groups}
        groups[x] = x
        single_df = agg_data(tmp_df, variables, groups, 'sum',
                             fill_groups=True)
        single_df.fillna(0, inplace=True)
        single_df['facet_x'] = names[x]
        single_df.rename(columns={x: 'x'}, inplace=True)

        # normalize
        if normalize:
            if len(non_x_groups) == 0:
                single_df['w'] = single_df['w'] / (single_df['w'].sum()
                                                   * bin_width_x[x])
            else:
                single_df['w'] = single_df.groupby(non_x_groups)['w'].apply(
                    lambda z: z / (z.sum() * bin_width_x[x]))

        df_ls.append(single_df)

    gdata = pd.concat(df_ls)
    gdata = gdata[[c for c in ['x', 'w', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # start plotting
    g = EZPlot(gdata)

    # set groups
    for single_df in df_ls:
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"), data=single_df,
                             stat='identity', colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w", group="factor(group)",
                                    fill="factor(group)"),
                             data=single_df, colour=None, stat='identity',
                             **POSITION_KWARGS[position])
    g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_y is None:
        g += p9.facet_wrap('~facet_x', scales='free')
    else:
        g += p9.facet_grid('facet_y~facet_x', scales='free')

    # set x scale
    g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab('Value') + \
        p9.ylab('Counts')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
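# Usage sketch (hedged): each entry in `x` becomes its own facet, so two
# columns can be compared side by side. Data and column names are
# illustrative only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
d = pd.DataFrame({'a': rng.normal(0, 1, 1000),
                  'b': rng.normal(1, 2, 1000)})
g = variable_histogram(d, x=['a', 'b'], bins=31, normalize=True)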
)

# In[14]:

from sklearn.calibration import calibration_curve

cnn_y, cnn_x = calibration_curve(confidence_score_df.curated_dsh,
                                 confidence_score_df.uncal, n_bins=10)
all_cnn_y, all_cnn_x = calibration_curve(confidence_score_df.curated_dsh,
                                         confidence_score_df.cal, n_bins=10)

calibration_df = pd.DataFrame.from_records(
    list(map(lambda x: {"predicted": x[0], "actual": x[1],
                        "model_calibration": 'before'},
             zip(cnn_x, cnn_y)))
    + list(map(lambda x: {"predicted": x[0], "actual": x[1],
                          "model_calibration": 'after'},
               zip(all_cnn_x, all_cnn_y)))
)
calibration_df.to_csv("output/dag_calibration.tsv", sep="\t", index=False)

# In[15]:

(
    p9.ggplot(calibration_df,
              p9.aes(x="predicted", y="actual", color="model_calibration"))
    + p9.geom_point()
    + p9.geom_line(p9.aes(group="factor(model_calibration)"))
    + p9.geom_abline(intercept=0, slope=1, linetype='dashed')
    + p9.scale_y_continuous(limits=[0, 1])
    + p9.scale_x_continuous(limits=[0, 1])
    + p9.theme_bw()
)
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
        Fits logistic regression to predict labels.
        """)

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-h5', '--h5_anndata',
        action='store',
        dest='h5',
        required=True,
        help='H5 AnnData file where clusters have been saved to cluster slot.')

    # parser.add_argument(
    #     '-ncpu', '--number_cpu',
    #     action='store',
    #     dest='number_cpu',
    #     default=50,
    #     type=int,
    #     help='Number of CPUs to use. Since we are testing the dask backend,\
    #         this corresponds to the number of CPUs available across all of\
    #         the worker jobs we spin out.\
    #         (default: %(default)s)'
    # )

    parser.add_argument(
        '-s', '--sparsity_l1',
        action='store',
        dest='sparsity_l1',
        default=0.0001,
        type=float,
        help='Smaller values specify stronger regularization.\
            (default: %(default)s)')

    parser.add_argument(
        '-nepoch', '--number_epoch',
        action='store',
        dest='number_epoch',
        default=25,
        type=int,
        help='Number of epochs.\
            (default: %(default)s)')

    parser.add_argument(
        '-bs', '--batch_size',
        action='store',
        dest='batch_size',
        default=32,
        type=int,
        help='Batch size. Divides the dataset into n batches and updates the\
            weights at the end of each one.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsc', '--train_size_cells',
        action='store',
        dest='train_size_cells',
        default=0,
        type=int,
        help='Number of cells to use for training set. If > 0 all\
            remaining cells not randomly selected for training will be used\
            for the test set. Overrides <train_size_fraction>.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsf', '--train_size_fraction',
        action='store',
        dest='train_size_fraction',
        default=0.67,
        type=float,
        help='Fraction of the data to use for training set.\
            (default: %(default)s)')

    parser.add_argument(
        '--dict_add',
        action='store',
        dest='dict_add',
        default='',
        type=str,
        help='Additional information to add to output model_report.\
            Format: key::value:::key2::value2.\
            Example: method::leiden:::resolution::3.0\
            (default: %(default)s)')

    parser.add_argument(
        '--grid_search',
        action='store_true',
        dest='grid_search',
        default=False,
        help='Run a grid search of hyperparameters.\
            (default: %(default)s)')

    parser.add_argument(
        '--memory_limit',
        action='store',
        dest='memory_limit',
        default=50,
        type=int,
        help='Memory limit in Gb.\
            (default: %(default)s)')

    parser.add_argument(
        '-of', '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: keras_model-<params>)')

    options = parser.parse_args()

    verbose = True

    # Set GPU memory limits
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        # For TF v1
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # session = tf.Session(config=config)

        # For TF v2
        try:
            # Method 1:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            # Method 2:
            # Restrict TensorFlow to only allocate 1GB of memory on the first
            # GPU
            # tf.config.experimental.set_virtual_device_configuration(
            #     gpus[0],
            #     [tf.config.experimental.VirtualDeviceConfiguration(
            #         memory_limit=options.memory_limit*1024
            #     )])
            # logical_gpus = tf.config.list_logical_devices('GPU')
            # print(
            #     len(gpus),
            #     "Physical GPUs,",
            #     len(logical_gpus),
            #     "Logical GPUs"
            # )
        except RuntimeError as e:
            # Virtual devices must be set before GPUs have been initialized
            print(e)
    else:
        raise Exception('ERROR: no GPUs detected.')

    # Get additional data we are going to append to the output model info
    dict_add = {}
    if options.dict_add != '':
        for item in options.dict_add.split(':::'):
            _tmp = item.split('::')
            if len(_tmp) != 2:
                raise Exception('ERROR: check dict_add.')
            else:
                dict_add[_tmp[0]] = _tmp[1]
    print(dict_add)

    # Load the AnnData file.
    # This file should already have clusters identified and saved to the
    # clusters slot.
    adata = sc.read_h5ad(filename=options.h5)

    # Set X to cp10k
    # adata.X = np.expm1(adata.layers['log1p_cp10k'])
    # Set X to ln(cp10k+1)
    # NOTE: Testing with 100k TI dataset, we were able to achieve higher
    # accuracy with log1p_cp10k - likely because of better spread in the
    # distribution.
    adata.X = adata.layers['log1p_cp10k']
    # Set X to raw counts
    # adata.X = adata.layers['counts']

    # Add some info from adata to dict_add
    for key, value in adata.uns['neighbors']['params'].items():
        dict_add['neighbors__{}'.format(key)] = value
    for key, value in adata.uns['cluster']['params'].items():
        dict_add['cluster__{}'.format(key)] = value

    # If train_size_cells, override the fraction so that the total number of
    # cells in the training set will be equal to train_size_cells.
    train_size_fraction = options.train_size_fraction
    if options.train_size_cells > 0:
        if options.train_size_cells >= adata.n_obs:
            raise Exception('Invalid train_size_cells.')
        train_size_fraction = (
            1 - ((adata.n_obs - options.train_size_cells) / adata.n_obs))
        if verbose:
            print('Set train_size_fraction to: {}.'.format(
                train_size_fraction))
    if verbose:
        print('Number cells training ({}) and testing ({}).'.format(
            int(train_size_fraction * adata.n_obs),
            int((1 - train_size_fraction) * adata.n_obs)))

    # Set X and y
    X = adata.X
    y = adata.obs['cluster'].values

    # Set other variables
    sparsity_l1 = options.sparsity_l1
    n_epochs = options.number_epoch
    batch_size = options.batch_size

    # Center and scale the data
    if sp.sparse.issparse(X):
        X = X.todense()
    X_std = X
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X)
    if verbose:
        print('center={} scale={}'.format(True, True))

    # One hot encode y (the cell type classes)
    # encode class values as integers
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    print('Found {} clusters'.format(len(encoder.classes_)))

    # Define the model
    # NOTE: Defaults determined via grid search of 160k TI single cells
    def classification_model(optimizer='sgd',
                             activation='softmax',
                             loss='categorical_crossentropy',
                             sparsity_l1__activity=0.0001,
                             sparsity_l2__activity=0.0,
                             sparsity_l1__kernel=0.0,
                             sparsity_l2__kernel=0.0,
                             sparsity_l1__bias=0.0,
                             sparsity_l2__bias=0.0):
        # create model
        model = Sequential()
        # Use a "softmax" activation function in the output layer. This is to
        # ensure the output values are in the range of 0 and 1 and may be used
        # as predicted probabilities.
        #
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax
        # Softmax assigns decimal probabilities to each class in a multi-class
        # problem. Those decimal probabilities must add up to 1.0. This
        # additional constraint helps training converge more quickly than it
        # otherwise would. Softmax is implemented through a neural network
        # layer just before the output layer. The Softmax layer must have the
        # same number of nodes as the output layer.
        # Softmax assumes that each example is a member of exactly one class.
        #
        # Softmax should be used for multi-class prediction with single label
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/video-lecture
        # NOTE: input dimension = number of features your data has
        model.add(Dense(
            len(encoder.classes_),  # output dim is number of classes
            use_bias=True,  # intercept
            activation=activation,  # softmax, sigmoid
            activity_regularizer=L1L2(l1=sparsity_l1__activity,
                                      l2=sparsity_l2__activity),
            kernel_regularizer=L1L2(l1=sparsity_l1__kernel,
                                    l2=sparsity_l2__kernel),
            bias_regularizer=L1L2(l1=sparsity_l1__bias,
                                  l2=sparsity_l2__bias),
            input_dim=X.shape[1]))
        # Example of adding additional layers
        # model.add(Dense(8, input_dim=4, activation='relu'))
        # model.add(Dense(3, activation='softmax'))

        # Metrics to check out over training epochs
        mets = [
            # loss,
            keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
            # keras.metrics.TruePositives(name='tp'),
            # keras.metrics.FalsePositives(name='fp'),
            # keras.metrics.TrueNegatives(name='tn'),
            # keras.metrics.FalseNegatives(name='fn'),
            # keras.metrics.Precision(name='precision'),
            # keras.metrics.Recall(name='recall'),
            # keras.metrics.AUC(name='auc'),
            keras.metrics.BinaryAccuracy(name='accuracy')
        ]
        # Use Adam gradient descent optimization algorithm with a logarithmic
        # loss function, which is called "categorical_crossentropy" in Keras.
        # UPDATE: sgd works better empirically.
        model.compile(
            optimizer=optimizer,  # adam, sgd
            loss=loss,
            metrics=mets)

        return model

    # Now, either call a grid search or specific model fit
    if options.grid_search:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
        out_file_base = '{}-grid_search'.format(out_file_base)

        # Call grid search of various parameters
        grid_result, df_grid_result = keras_grid(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            n_epochs=n_epochs,
            batch_size=batch_size)

        # NOTE: This will fail because we can't pickle KerasClassifier. This
        # is fine though because results are saved in tsv.gz format below.
        # Save the results
        # out_f = '{}-grid_result.gz'.format(out_file_base)
        # joblib.dump(
        #     grid_result,
        #     out_f,
        #     compress=('gzip', 3)
        # )
        # Load the model
        # lr = joblib.load(
        #     'test-lr_model.joblib.gz'
        # )
        # print(lr)

        # Save the results of our search to tsv
        out_f = '{}-grid_result.tsv.gz'.format(out_file_base)
        df_grid_result.to_csv(out_f,
                              sep='\t',
                              index=False,
                              quoting=csv.QUOTE_NONNUMERIC,
                              na_rep='',
                              compression=compression_opts)

        # Add a single column that summarizes params
        param_columns = [col for col in df_grid_result.columns
                         if 'param__' in col]
        df_grid_result['params'] = df_grid_result[
            param_columns].astype(str).apply(lambda x: '-'.join(x), axis=1)

        # Plot the distribution of accuracy across folds
        split_columns = [col for col in df_grid_result.columns
                         if 'split' in col]
        split_columns = [col for col in split_columns
                         if '_test_score' in col]
        df_plt = pd.melt(df_grid_result,
                         id_vars=['params'],
                         value_vars=split_columns)
        gplt = plt9.ggplot(df_plt, plt9.aes(x='params', y='value'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_boxplot(alpha=0.8)
        gplt = gplt + plt9.geom_jitter(alpha=0.75)
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0
            # limits=[0, 1]
        )
        gplt = gplt + plt9.labs(x='Parameters', y='Score', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-score.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

        # Plot the mean time and std err for fitting results
        gplt = plt9.ggplot(df_grid_result,
                           plt9.aes(x='params', y='mean_fit_time'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_point()
        gplt = gplt + plt9.geom_errorbar(
            plt9.aes(ymin='mean_fit_time-std_fit_time',
                     ymax='mean_fit_time+std_fit_time'),
            width=0.2,
            position=plt9.position_dodge(0.05))
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
        gplt = gplt + plt9.labs(x='Parameters', y='Mean fit time', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-fit_time.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)
    else:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
        # out_file_base = '{}-center={}-scale={}'.format(
        #     out_file_base,
        #     center,
        #     scale
        # )
        out_file_base = '{}-batch_size={}-epochs={}'.format(
            out_file_base, batch_size, n_epochs)
        out_file_base = '{}-sparsity_l1={}-train_size_fraction={}'.format(
            out_file_base,
            str(sparsity_l1).replace('.', 'pt'),
            str(train_size_fraction).replace('.', 'pt'))

        # Fit the specific model and save the results
        model, model_report, y_prob_df, history = fit_model_keras(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            sparsity_l1=sparsity_l1,
            sparsity_l2=0.0,
            n_epochs=n_epochs,
            batch_size=batch_size,
            train_size_fraction=train_size_fraction)

        # Save the model, weights (coefficients), and bias (intercept)
        model.save('{}.h5'.format(out_file_base),
                   overwrite=True,
                   include_optimizer=True)

        # Save the model and weights (coefficients) separately
        # open('{}.json'.format(out_file_base), 'w').write(model.to_json())
        open('{}.yml'.format(out_file_base), 'w').write(model.to_yaml())
        model.save_weights('{}-weights.h5'.format(out_file_base))
        # Example read functions
        # model = model_from_yaml(open('my_model_architecture.yaml').read())
        # model.load_weights('my_model_weights.h5')

        # Save the model report
        # Add column telling us if this is cluster or summary value
        is_cluster = []
        for i in model_report.index:
            if i in encoder.classes_:
                is_cluster.append(True)
            else:
                is_cluster.append(False)
        model_report['is_cluster'] = is_cluster
        # Add in extra data
        model_report['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                model_report[key] = value
        print(model_report)
        out_f = '{}-model_report.tsv.gz'.format(out_file_base)
        model_report.to_csv(out_f,
                            sep='\t',
                            index=True,
                            index_label='cell_label',
                            quoting=csv.QUOTE_NONNUMERIC,
                            na_rep='',
                            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Save the test results - each row is a cell and the columns are the
        # prob of that cell belonging to a particular class.
        # Add in extra data
        y_prob_df['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                y_prob_df[key] = value
        out_f = '{}-test_result.tsv.gz'.format(out_file_base)
        y_prob_df.to_csv(
            out_f,
            sep='\t',
            index=False,
            # NOTE: Not adding the label to test_result index.
            # index_label='cell_label',
            quoting=csv.QUOTE_NONNUMERIC,
            na_rep='',
            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Make a matrix of weights per gene
        # Columns = genes tested and rows = cell type label
        weight, bias = model.layers[-1].get_weights()
        # weight, bias = model.get_layer("output").get_weights()
        df_weights = pd.DataFrame.from_records(
            weight,
            index=adata.var.index,  # index is gene
            columns=encoder.classes_)
        # Save the weights dataframe.
        out_f = '{}-weights.tsv.gz'.format(out_file_base)
        df_weights.to_csv(out_f,
                          sep='\t',
                          index=True,
                          index_label='ensembl_gene_id',
                          quoting=csv.QUOTE_NONNUMERIC,
                          na_rep='',
                          compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot the number of features with non-zero coefficients in each
        # cluster.
        out_f = '{}-n_features.png'.format(out_file_base)
        df_plt = pd.DataFrame({
            'classes': df_weights.columns,
            'features': (df_weights != 0).sum(axis=0)
        })
        df_plt = df_plt.set_index('classes')
        # print(df_plt)
        # Add in categories with no predictive model (e.g., because they were
        # too few in training).
        for i in adata.obs['cluster'].cat.categories:
            if i not in df_plt.index:
                df_plt = df_plt.append(
                    pd.Series([0], index=df_plt.columns, name=i))
        fig = plt.figure(figsize=(max(0.5 * len(df_plt.index), 5), 4))
        # plt.bar(lr.classes_, n_features)
        plt.bar(df_plt.index, df_plt['features'])
        plt.xlabel('Cluster')
        plt.ylabel('Features with coefficient != 0')
        plt.xticks(rotation=90)
        for i in df_plt.index:
            plt.annotate(str(df_plt.loc[i, 'features']),
                         xy=(i, df_plt.loc[i, 'features']))
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)

        # Plot ROC of the test and truth.
        out_f = '{}-roc.png'.format(out_file_base)
        fig = plt.figure()
        cell_label_true = y_prob_df.pop('cell_label_true')
        # Drop columns that are not cell type labels
        for i in y_prob_df.columns:
            if 'class__' not in i:
                del y_prob_df[i]
        plot_roc(y_prob_df.values, cell_label_true.values, y_prob_df.columns)
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot metrics vs cluster size to see if smaller clusters have poorer
        # metric measures.
        df_plt = model_report.fillna(0)
        for i in df_plt.index:
            if i not in encoder.classes_:
                df_plt = df_plt.drop(i)
        for i in ['AUC', 'f1-score', 'average_precision_score', 'MCC']:
            out_f = '{}-cluster_size_{}.png'.format(out_file_base, i)
            fig = plt.figure()
            plt.scatter(df_plt['n_cells_full_dataset'], df_plt[i], alpha=0.5)
            plt.xlabel('Number of cells in cluster (full dataset)')
            plt.ylabel(i)
            if i in ['AUC', 'f1-score', 'average_precision_score']:
                plt.ylim(0, 1)
            elif i == 'MCC':
                plt.ylim(-1, 1)
            # Add annotation of the cluster
            for index, row in df_plt.iterrows():
                if row['n_cells_full_dataset'] == 0:
                    print('ERROR: n_cells_full_dataset = 0 for {}.'.format(
                        index))
                plt.annotate(
                    index,  # this is the text
                    (row['n_cells_full_dataset'], row[i]),  # point to label
                    textcoords='offset points',  # how to position the text
                    xytext=(0, 10),  # distance from text to points (x,y)
                    ha='center'  # horiz alignment can be left, right, center
                )
            fig.savefig(out_f, dpi=300, bbox_inches='tight')
            plt.xscale('log', basex=10)
            fig.savefig('{}-cluster_size_{}_log10.png'.format(
                out_file_base, i),
                dpi=300,
                bbox_inches='tight')
            plt.close(fig)
            if verbose:
                print('Completed: save {}.'.format(out_f))

        # Plot history of metrics over epochs
        for dat_i in history.history.keys():
            fig = plt.figure()
            plt.plot(history.history[dat_i])
            plt.ylabel(dat_i)
            plt.xlabel('Epoch')
            fig.savefig('{}-model_iter_{}.png'.format(out_file_base, dat_i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
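# Hedged standalone sketch of the model defined above: a single Dense layer
# with softmax activation and L1/L2 regularizers is multinomial logistic
# regression in Keras. The shapes (100 features, 5 classes) are illustrative.
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import L1L2

sketch = Sequential([
    Dense(5,                     # one output node per class
          use_bias=True,         # intercept
          activation='softmax',  # outputs sum to 1 -> class probabilities
          kernel_regularizer=L1L2(l1=0.0001, l2=0.0),
          input_dim=100)
])
sketch.compile(optimizer='sgd', loss='categorical_crossentropy',
               metrics=[keras.metrics.CategoricalAccuracy()])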
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
        Filter and merge 10x data. Save to AnnData object.
        """)

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '--tsv_file',
        action='store',
        dest='tsv',
        required=True,
        help='cell_filtered_per_experiment tsv file.')

    parser.add_argument(
        '-of', '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output png file. Will have .png appended.\
            (default: %(default)s)')

    options = parser.parse_args()

    # Get basename of the output file
    out_file_base = options.of
    if out_file_base == '':
        # NOTE: the original used rstrip('tsv.gz'), which strips a character
        # set rather than the suffix; removing the suffix explicitly is safer.
        out_file_base = os.path.basename(options.tsv)
        if out_file_base.endswith('.tsv.gz'):
            out_file_base = out_file_base[:-len('.tsv.gz')]

    # Load the data
    df = pd.read_csv(options.tsv, sep='\t')

    # Get the total number of input cells per sample
    df_before_filters = df[df.filter_type.isin(['before_filters'])]
    df_before_filters = df_before_filters.set_index('experiment_id')

    # Check if any difference between before and after filters. If not,
    # return early.
    df_after_filters = df[df.filter_type.isin(['after_filters'])]
    filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[
        df_after_filters.experiment_id, 'n_cells_left_in_adata'].values
    if all(filt):
        print("No difference detected before and after filters. No plots.")
        return

    # Set some plotting parameters
    plt_height = 16  # 1.5 * df.experiment_id.nunique()

    # Plot the number of cells before and after all filters across experiments
    df_plt = df[df.filter_type.isin(['before_filters', 'after_filters'])]
    gplt = plt9.ggplot(df_plt, plt9.aes(
        x='experiment_id',
        y='n_cells_left_in_adata',
        # label='n_cells',
        fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    # gplt = gplt + plt9.geom_text(vjust=1.6, color='white', size=3.5)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10',
        labels=comma_labels,
        minor_breaks=0)
    gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(title='', y='Number of cells', x='', fill='')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='horizontal',
        legend_title=plt9.element_blank())
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-n_cells_before_after.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the final fraction of cells filtered per experiment
    df_plt = df_after_filters.copy()
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    df_plt.n_cells_left_in_adata = df_before_filters.loc[
        df_plt.experiment_id,
        'n_cells_left_in_adata'].values - df_plt.n_cells_left_in_adata
    # Now calculate the fraction removed
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / \
        df_before_filters.loc[
            df_plt.experiment_id,
            'n_cells_left_in_adata'].values
    gplt = plt9.ggplot(df_plt, plt9.aes(
        x='experiment_id',
        y='fraction_cells',
        fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='',
        y='Fraction of total cells excluded',
        x='',
        fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-fraction_before_after.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the number of cells falling into each filter across experiments.
    # NOTE: cells can fall into multiple filters.
    # Remove the rows that we do not want
    df_plt = df[~df.filter_type.isin(['before_filters', 'after_filters'])]
    df_plt = df_plt[~df_plt.filter_type.str.contains('after_filter')]
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    df_plt.n_cells_left_in_adata = df_before_filters.loc[
        df_plt.experiment_id,
        'n_cells_left_in_adata'].values - df_plt.n_cells_left_in_adata
    gplt = plt9.ggplot(df_plt, plt9.aes(
        x='experiment_id',
        y='n_cells_left_in_adata',
        fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='',
        y='Number of cells excluded',
        x='',
        fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-n_cells_excluded.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)

    # Plot the ratio of the total number of cells removed in each filter
    # across experiments.
    # NOTE: cells can fall into multiple filters.
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / \
        df_before_filters.loc[
            df_plt.experiment_id,
            'n_cells_left_in_adata'].values
    gplt = plt9.ggplot(df_plt, plt9.aes(
        x='experiment_id',
        y='fraction_cells',
        fill='filter_type'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    if df_plt.filter_type.nunique() < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(
        title='',
        y='Fraction of total cells excluded',
        x='',
        fill='Filter')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        # legend_position='bottom',
        subplots_adjust={'bottom': 0.15},
        legend_position=(.5, .05),
        legend_direction='vertical')
    gplt = gplt + plt9.coord_flip()
    gplt.save('{}-fraction_cells_excluded.png'.format(out_file_base),
              dpi=300,
              width=4,
              height=plt_height)
gg_rep_act.save(os.path.join(dir_output, 'gg_rep_act.png'),
                width=8, height=4)

di_notes = {
    'chi2': 'χ2-correction',
    'insig': 'Erroneous',
    'specification': 'Specification',
    'non-replicable': 'Inconsistent'
}

# (ii) Breakdown of counts
tmp = acc_tt.merge(
    res_fisher.tt.value_counts().reset_index().rename(columns={
        'index': 'tt',
        'tt': 'n_lit'
    }))
tmp = tmp.assign(tt=lambda x: x.tt.map(di_tt),
                 notes=lambda x: x.notes.map(di_notes),
                 share=lambda x: x.n / x.n_lit)
gg_acc_notes = (
    pn.ggplot(tmp, pn.aes(x='notes', y='share', fill='tt'))
    + pn.theme_bw()
    + pn.scale_y_continuous(labels=percent_format(), limits=[0, 0.1])
    + pn.scale_fill_discrete(name='Literature')
    + pn.geom_col(color='black', position=pn.position_dodge(0.5), width=0.5)
    + pn.labs(y='Percent', x='Investigation')
    + pn.theme(axis_text_x=pn.element_text(angle=45),
               axis_title_x=pn.element_blank()))
gg_acc_notes.save(os.path.join(dir_output, 'gg_acc_notes.png'),
                  width=7, height=3)

print('~~~ End of 4_results_insig.py ~~~')
df_3.to_csv('/home/treelab/Documents/CUDAGP/script_GP1/graphs/mean_%s_%s.csv'
            % (df_new_3['popsize'][0], df_new_3['indsize'][0]))

try:
    df_4 = df_new_4.groupby(['nrow', 'nvar'])['timewr'].mean()
    df_4.to_csv(
        '/home/treelab/Documents/CUDAGP/script_GP1/graphs/mean_%s_%s.csv'
        % (df_new_4['popsize'][0], df_new_4['indsize'][0]))
except Exception:
    print('error')

for ielem in (df_new_1, df_new_2, df_new_3, df_new_4):
    surveys_plot = (
        p9.ggplot(data=ielem,
                  mapping=p9.aes(x='run', y='timewr', color='factor(nvar)'))
        + p9.geom_point()
        + p9.facet_grid("~nrow")
        + p9.scale_y_continuous(limits=(0, 500))
        + p9.scale_x_discrete(breaks=range(0, 35, 5))
        + p9.theme(text=p9.element_text(size=10, family="serif"),
                   plot_title=p9.element_text(weight='bold', size=14),
                   legend_title=p9.element_text(weight='bold', size=14),
                   legend_text=p9.element_text(weight='bold', size=10),
                   axis_title_y=p9.element_text(weight='bold', size=14),
                   axis_title_x=p9.element_text(weight='bold', size=14))
        + p9.labs(y='Time (s)', x='Number of run',
                  title='Population Size [%s]' % ielem['popsize'][0],
                  color='Features'))
    # Change to the path where you want to save them
    surveys_plot.save("./data_%s_%s.pdf" % (ielem['popsize'][0],
                                            ielem['indsize'][0]),
                      width=11,
def htcalc(air_velocity_inside, air_velocity_outside, t_inside, t_outside,
           surface, layers, wall_thickness, thermal_conductivity):
    # We need the convective heat resistance on both sides of the wall
    res_conv_inside = heattransfer.convective_resistance(
        heattransfer.heat_transfer_coef(air_velocity_inside), surface)
    res_conv_outside = heattransfer.convective_resistance(
        heattransfer.heat_transfer_coef(air_velocity_outside), surface)

    # We need the total resistance over all wall layers
    total_layer_resistance = []
    total_layer_resistance.append(res_conv_inside)
    for i in range(layers):
        total_layer_resistance.append(
            heattransfer.conductive_resistance(wall_thickness[i],
                                               thermal_conductivity[i],
                                               surface))
    total_layer_resistance.append(res_conv_outside)
    total_resistance = sum(total_layer_resistance)

    heat_transfer = heattransfer.conduction(t_inside, t_outside,
                                            total_resistance)

    # Calculating the temperatures between each layer
    temperatures = []
    temperatures.append(t_inside)
    layer_resistance = 0
    for resistance in total_layer_resistance:
        layer_resistance += resistance
        temperatures.append(
            heattransfer.layer_temperature(heat_transfer, layer_resistance,
                                           t_inside))

    # Preparing the x axis, position of the temperature and transition labels
    # for the graph
    position = [0, 0.02]
    labels = ['fluid inside', 'inner surface']
    i = 0
    for entry in wall_thickness:
        position.append(position[-1] + entry)
        i += 1
        labels.append("layer" + str(i))
    labels[-1] = "outer surface"
    position.append(position[-1] + 0.02)
    labels.append("fluid outside")

    # print(f"\nThe total resistance is {round(total_resistance, 2)} K/W")
    # print(f"Total heat transfer from inside to outside is {round(heat_transfer, 2)} W\n")

    df = pd.DataFrame({'pos': position, 'temp': temperatures})

    gg = p9.ggplot(df, p9.aes(x='pos', y='temp'))
    gg += p9.geom_line(p9.aes(color='temp'), size=2)
    for ws in df.pos.values.tolist():
        gg += p9.geom_vline(xintercept=ws, color='grey')
    # gg += p9.geom_hline(yintercept=110, color='red', size=2, alpha=0.8)
    gg += p9.ggtitle('heat transfer through wall')
    gg += p9.scale_x_continuous(name='Position',
                                breaks=df.pos.values.tolist(),
                                labels=labels)
    gg += p9.scale_y_continuous(name='Temperature')
    gg += p9.theme(axis_text_x=p9.element_text(angle=45))
    gg += p9.scale_colour_gradient(low="yellow", high="orange")

    for i, temp in enumerate(temperatures):
        gg += p9.geom_text(
            p9.aes(x=position[i], y=temp + 30, label=round(temp, 2)))

    for i in range(layers):
        labtext = ('Thermal cond.: ' + str(thermal_conductivity[i])
                   + ' [W/m°K]\nLayer thickness: '
                   + str(round(wall_thickness[i], 3)) + ' [m]')
        gg += p9.annotate(geom='text',
                          x=((position[i + 2] - position[i + 1]) / 2)
                          + position[i + 1],
                          y=temperatures[i] + 30,
                          label=labtext,
                          color='blue')

    return gg
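# Usage sketch (hedged): the `heattransfer` helper module is assumed from the
# surrounding project; the two-layer wall values below are illustrative only.
wall = htcalc(air_velocity_inside=0.5, air_velocity_outside=5.0,
              t_inside=293.0, t_outside=263.0, surface=10.0,
              layers=2,
              wall_thickness=[0.1, 0.05],
              thermal_conductivity=[0.8, 0.04])
print(wall)  # renders the plotnine figure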
# Get the descriptive statistics for prices.
print('\nTHE PRICE ATTRIBUTE HAS THE FOLLOWING STATISTICS:')
print(prices.describe())

# Graph price distribution as a boxplot.
prices_df = pd.DataFrame({
    'x': [''] * len(prices),
    'price': prices
})
gg.options.figure_size = (4, 6)
g = (
    gg.ggplot(data=prices_df)
    + gg.geom_boxplot(mapping=gg.aes(x='x', y='price'))
    + gg.theme_bw()
    + gg.ggtitle('Ranges of Prices Paid Across All Figures')
    + gg.xlab('')
    + gg.ylab('Price Paid')
    + gg.scale_y_continuous(labels=dollar_format(digits=0)))
g.draw()
plt.show()

# Group figures by year and get counts per year.
year_gb = fig_data.groupby('year')
volume_per_year = year_gb.aggregate('count')
volume_per_year.drop(0, inplace=True)

# Plot a histogram of count of figures per year.
mpl.rcParams['figure.figsize'] = [8.0, 6.0]
mpl.rcParams['figure.dpi'] = 100
fig, ax = plt.subplots()  # Create a graph
plt.bar(x=volume_per_year.index, height=volume_per_year["figure_id"])
ax.set_title('Year of Production of Figures in Collection')  # Title the chart
ax.set_xlabel('Year of Release')  # Title the x-axis
def hist_plot(df,
              x,
              y=None,
              group=None,
              facet_x=None,
              facet_y=None,
              w='1',
              bins=21,
              bin_width=None,
              position='stack',
              normalize=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Plot a 1-d or 2-d histogram

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str
        quoted expression to be plotted on the y axis. If this is specified
        the histogram will be 2-d.
    group : str
        quoted expression to be used as group (i.e. color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    w : str
        quoted expression representing histogram weights (default is 1)
    bins : int or tuple
        number of bins to be used
    bin_width : float or tuple
        bin width to be used
    position : str
        if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
        normalize histogram counts
    sort_groups : bool
        sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_width should be defined")
        raise ValueError("Either bins or bin_width should be defined")

    if (bins is not None) and (bin_width is not None):
        log.error("Only one of bins and bin_width should be defined")
        raise ValueError("Only one of bins and bin_width should be defined")

    if (y is not None) and (group is not None):
        log.error("y and group cannot be requested at the same time")
        raise ValueError("y and group cannot be requested at the same time")

    if y is None:
        bins = (bins, bins)
        bin_width = (bin_width, bin_width)
    else:
        if type(bins) not in [tuple, list]:
            bins = (bins, bins)
        if type(bin_width) not in [tuple, list]:
            bin_width = (bin_width, bin_width)

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'y', 'group', 'facet_x', 'facet_y'],
                          [x, y, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c: c for c in tmp_df.columns
                  if c in ['x', 'y', 'group', 'facet_x', 'facet_y']}
    non_xy_groups = [g for g in new_groups.keys() if g not in ['x', 'y']]
    new_variables = {'w': 'w'}

    # bin data (if necessary)
    if tmp_df['x'].dtypes != np.dtype('O'):
        tmp_df['x'], bins_x, bin_width_x = bin_data(tmp_df['x'], bins[0], bin_width[0])
    else:
        bin_width_x = 1
    if y is not None:
        if tmp_df['y'].dtypes != np.dtype('O'):
            tmp_df['y'], bins_y, bin_width_y = bin_data(tmp_df['y'], bins[1], bin_width[1])
        else:
            bin_width_y = 1
    else:
        bin_width_y = 1

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, 'sum', fill_groups=True)
    gdata.fillna(0, inplace=True)
    gdata = gdata[[c for c in ['x', 'y', 'w', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # normalize
    if normalize:
        if len(non_xy_groups) == 0:
            gdata['w'] = gdata['w'] / (gdata['w'].sum() * bin_width_x * bin_width_y)
        else:
            gdata['w'] = gdata.groupby(non_xy_groups)['w'] \
                .apply(lambda x: x / (x.sum() * bin_width_x * bin_width_y))

    # start plotting
    g = EZPlot(gdata)
    # determine order and create a categorical type
    if (group is not None) and sort_groups:
        if g.column_is_categorical('x'):
            g.sort_group('x', 'w', ascending=False)
        g.sort_group('group', 'w')
        g.sort_group('facet_x', 'w', ascending=False)
        g.sort_group('facet_y', 'w', ascending=False)
        if groups:
            colors = np.flip(ez_colors(g.n_groups('group')))
    elif (group is not None):
        colors = ez_colors(g.n_groups('group'))

    if y is None:

        # set groups
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             stat='identity',
                             colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             colour=None,
                             stat='identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=colors)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab('Counts')

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text=names['group'],
                                                   size=base_size))

        if sort_groups:
            g += p9.guides(fill=p9.guide_legend(reverse=True))

    else:

        g += p9.geom_tile(p9.aes(x="x", y="y", fill='w'),
                          stat='identity',
                          colour=None)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        if g.column_is_categorical('y'):
            g += p9.scale_y_discrete()
        else:
            g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab(names['y'])

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text='Counts',
                                                   size=base_size))

    return g
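# A minimal usage sketch for hist_plot, assuming ezplot's helpers (unname,
# agg_data, bin_data, EZPlot, ez_colors, ez_labels, theme_ez, POSITION_KWARGS)
# are importable as above. The toy columns are hypothetical.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'value': np.random.randn(1000),
    'cohort': np.random.choice(['a', 'b'], size=1000)})
g = hist_plot(toy, x='value', group='cohort',
              bins=30, position='overlay', normalize=True)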
def estimate_cutoffs_plot(output_file,
                          df_plt,
                          df_estimate_ncells,
                          df_fit=None,
                          scale_x_log10=False,
                          save_plot=True,
                          add_text=False):
    """Plot UMI counts by sorted cell barcodes."""
    if min(df_plt['umi_counts']) <= 0:
        fix_log_scale = min(df_plt['umi_counts']) + 1
        df_plt['umi_counts'] = df_plt['umi_counts'] + fix_log_scale
    if add_text:
        df_estimate_ncells['add_text_y'] = np.random.randint(
            low=df_plt['umi_counts'].min() - 25,
            high=df_plt['umi_counts'].max() - 25,
            size=df_estimate_ncells.shape[0])
    gplt = plt9.ggplot()
    gplt = gplt + plt9.theme_bw()
    if len(df_plt) <= 50000:
        gplt = gplt + plt9.geom_point(
            mapping=plt9.aes(x='barcode', y='umi_counts'),
            data=df_plt,
            alpha=0.05,
            size=0.1)
    else:
        # With many points a line renders much faster than a point cloud.
        gplt = gplt + plt9.geom_line(
            mapping=plt9.aes(x='barcode', y='umi_counts'),
            data=df_plt,
            alpha=0.25,
            size=0.75,
            color='grey')
    gplt = gplt + plt9.geom_vline(
        mapping=plt9.aes(xintercept='n_cells', color='method'),
        data=df_estimate_ncells,
        alpha=0.75,
        linetype='dashdot')
    if add_text:
        gplt = gplt + plt9.geom_text(
            mapping=plt9.aes(x='n_cells',
                             y='add_text_y',
                             label='n_cells',
                             color='method'),
            data=df_estimate_ncells,
            alpha=0.75)
    gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
    if scale_x_log10:
        gplt = gplt + plt9.scale_x_continuous(
            trans='log10', labels=comma_labels, minor_breaks=0)
    else:
        gplt = gplt + plt9.scale_x_continuous(
            labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.labs(title='',
                            y='UMI counts',
                            x='Barcode index, sorted by UMI count',
                            color='Cutoff')
    # Add the fit of the droplet utils model. Test explicitly against None:
    # a DataFrame has no unambiguous truth value.
    if df_fit is not None:
        gplt = gplt + plt9.geom_line(
            mapping=plt9.aes(x='x', y='y'),
            data=df_fit,
            alpha=1,
            color='yellow')
    if save_plot:
        gplt.save('{}.png'.format(output_file), dpi=300, width=5, height=4)
    return gplt
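# Usage sketch with synthetic data. comma_labels is assumed to be a
# mizani-style label formatter defined elsewhere in this module; the cutoff
# values and method names below are hypothetical.
n = 10000
umis = np.sort(np.random.lognormal(6, 2, n).astype(int))[::-1]
df_plt = pd.DataFrame({'barcode': np.arange(n), 'umi_counts': umis})
df_cuts = pd.DataFrame({'n_cells': [1500, 2200],
                        'method': ['knee', 'expected']})
estimate_cutoffs_plot('umi_cutoffs', df_plt, df_cuts, scale_x_log10=True)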
def plot_it(molecules, t):
    # Histogram the molecule positions over the domain [x_lo, x_hi].
    counts = [0] * (x_hi - x_lo + 1)
    for mol_loc in molecules:
        counts[mol_loc] += 1
    data = pd.DataFrame({"x": range(x_lo, x_hi + 1), "t": t, "count": counts})
    return p9.geom_line(data=data, size=1)


if __name__ == "__main__":
    molecule_locations = [100] * num_molecules
    my_plot = p9.ggplot(p9.aes(x="x", y="count", color="t"))
    start = time.perf_counter()
    intervals = partition(num_molecules, NUM_PROCESSES)
    with multiprocessing.Pool(NUM_PROCESSES) as pool:
        for t in range(0, 2001, PLOT_EVERY):
            # Plot the current state, then advance every partition of the
            # molecules by PLOT_EVERY ms in parallel.
            my_plot += plot_it(molecule_locations, f"{t} ms")
            if t < 2000:  # no need to advance past the last plotted frame
                mol_parts = [molecule_locations[i[0]:i[1]] for i in intervals]
                molecule_locations = list(
                    itertools.chain.from_iterable(
                        pool.map(advance,
                                 [(mols, PLOT_EVERY) for mols in mol_parts])))
    stop = time.perf_counter()
    print(f"simulation time: {stop - start}")
    my_plot += p9.scale_y_continuous(limits=(0, 3_000))
    my_plot.draw()
    plt.show()
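# The script above assumes two helpers, partition and advance. The sketches
# below show the shapes the script relies on; the real advance would
# implement the actual random-walk step, so treat this as illustrative only.
import random

def partition(n, parts):
    # Split range(n) into `parts` contiguous (start, stop) index pairs.
    size, rem = divmod(n, parts)
    bounds, start = [], 0
    for p in range(parts):
        stop = start + size + (1 if p < rem else 0)
        bounds.append((start, stop))
        start = stop
    return bounds

def advance(args):
    # Advance each molecule by `steps` random unit moves, clamped to the domain.
    mols, steps = args
    for _ in range(steps):
        mols = [min(max(m + random.choice((-1, 1)), x_lo), x_hi) for m in mols]
    return mols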
def syntactic_diversity_plots(): with open('data/external/syntactic_diversity_table.json') as f: rows = json.load(f) parse_df = pd.DataFrame(rows) parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses'] melt_df = pd.melt( parse_df, id_vars=['dataset', 'depth', 'overlap', 'parses'], value_vars=['parse_ratio', 'unique_parses'], var_name='metric', value_name='y' ) def label_facet(name): if name == 'parse_ratio': return 'Average Unique Parses per Instance' elif name == 'unique_parses': return 'Count of Unique Parses' def label_y(ys): formatted_ys = [] for y in ys: y = str(y) if y.endswith('000.0'): formatted_ys.append(y[:-5] + 'K') else: formatted_ys.append(y) return formatted_ys p = ( ggplot(melt_df) + aes(x='depth', y='y', color='dataset') + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet) + geom_line() + geom_point() + xlab('Parse Truncation Depth') + ylab('') + scale_color_discrete(name='Dataset') + scale_y_continuous(labels=label_y) + scale_x_continuous( breaks=list(range(1, 11)), minor_breaks=list(range(1, 11)), limits=[1, 10]) + theme_fs() ) p.save(path.join(output_path, 'syn_div_plot.pdf')) p = ( ggplot(parse_df) + aes(x='depth', y='unique_parses', color='dataset') + geom_line() + geom_point() + xlab('Parse Truncation Depth') + ylab('Count of Unique Parses') + scale_color_discrete(name='Dataset') + scale_x_continuous( breaks=list(range(1, 11)), minor_breaks=list(range(1, 11)), limits=[1, 10]) + theme_fs() ) p.save(path.join(output_path, 'n_unique_parses.pdf')) p = ( ggplot(parse_df) + aes(x='depth', y='parse_ratio', color='dataset') + geom_line() + geom_point() + xlab('Parse Truncation Depth') + ylab('Average Unique Parses per Instance') + scale_color_discrete(name='Dataset') + scale_x_continuous(breaks=list(range(1, 11)), minor_breaks=list(range(1, 11)), limits=[1, 10]) + scale_y_continuous(limits=[0, 1]) + theme_fs() ) p.save(path.join(output_path, 'parse_ratio.pdf'))
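# For reference, each row of data/external/syntactic_diversity_table.json is
# assumed to carry at least the fields read above; the values below are
# hypothetical, and parse_ratio is derived inside the function itself.
example_row = {
    'dataset': 'quizbowl',   # dataset name
    'depth': 3,              # parse truncation depth, 1..10
    'overlap': 0.42,
    'parses': 120000,
    'unique_parses': 35000,
}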
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False): if self.y_max is not None: limits = [0, float(self.y_max)] eprint(f'Setting limits to: {limits}') else: limits = [0, 1] if expo: if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans: with open('data/external/all_human_gameplay.json') as f: all_gameplay = json.load(f) frames = [] for event, name in [('parents', 'Intermediate'), ('maryland', 'Expert'), ('live', 'National')]: if self.merge_humans: name = 'Human' gameplay = all_gameplay[event] if event != 'live': control_correct_positions = gameplay['control_correct_positions'] control_wrong_positions = gameplay['control_wrong_positions'] control_positions = control_correct_positions + control_wrong_positions control_positions = np.array(control_positions) control_result = np.array(len(control_correct_positions) * [1] + len(control_wrong_positions) * [0]) argsort_control = np.argsort(control_positions) control_x = control_positions[argsort_control] control_sorted_result = control_result[argsort_control] control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0] control_df = pd.DataFrame({'correct': control_y, 'char_percent': control_x}) control_df['Dataset'] = 'Regular Test' control_df['Guessing_Model'] = f' {name}' frames.append(control_df) adv_correct_positions = gameplay['adv_correct_positions'] adv_wrong_positions = gameplay['adv_wrong_positions'] adv_positions = adv_correct_positions + adv_wrong_positions adv_positions = np.array(adv_positions) adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0]) argsort_adv = np.argsort(adv_positions) adv_x = adv_positions[argsort_adv] adv_sorted_result = adv_result[argsort_adv] adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0] adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x}) adv_df['Dataset'] = 'IR Adversarial' adv_df['Guessing_Model'] = f' {name}' frames.append(adv_df) if len(gameplay['advneural_correct_positions']) > 0: adv_correct_positions = gameplay['advneural_correct_positions'] adv_wrong_positions = gameplay['advneural_wrong_positions'] adv_positions = adv_correct_positions + adv_wrong_positions adv_positions = np.array(adv_positions) adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0]) argsort_adv = np.argsort(adv_positions) adv_x = adv_positions[argsort_adv] adv_sorted_result = adv_result[argsort_adv] adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0] adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x}) adv_df['Dataset'] = 'RNN Adversarial' adv_df['Guessing_Model'] = f' {name}' frames.append(adv_df) human_df = pd.concat(frames) human_vals = sort_humans(list(human_df['Guessing_Model'].unique())) human_dtype = CategoricalDtype(human_vals, ordered=True) human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype) dataset_dtype = CategoricalDtype(['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True) human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype) if no_models: p = ggplot(human_df) + geom_point(shape='.') else: df = self.char_plot_df if 1 not in self.rounds: df = df[df['Dataset'] != 'Round 1 - IR Adversarial'] if 2 not in self.rounds: df = df[df['Dataset'] != 'Round 2 - IR Adversarial'] df = df[df['Dataset'] != 'Round 2 - RNN Adversarial'] p = ggplot(df) if self.save_df is not None: eprint(f'Saving df to: {self.save_df}') df.to_json(self.save_df) if 
os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans: eprint('Loading human data') p = p + geom_line(data=human_df) if columns: facet_conf = facet_wrap('Guessing_Model', ncol=1) else: facet_conf = facet_wrap('Guessing_Model', nrow=1) if not no_models: if self.mvg_avg_char: chart = stat_smooth(method='mavg', se=False, method_args={'window': 400}) else: chart = stat_summary_bin(fun_data=mean_no_se, bins=20, shape='.', linetype='None', size=0.5) else: chart = None p = ( p + facet_conf + aes(x='char_percent', y='correct', color='Dataset') ) if chart is not None: p += chart p = ( p + scale_y_continuous(breaks=np.linspace(0, 1, 6)) + scale_x_continuous(breaks=[0, .5, 1]) + coord_cartesian(ylim=limits) + xlab('Percent of Question Revealed') + ylab('Accuracy') + theme( #legend_position='top', legend_box_margin=0, legend_title=element_blank(), strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5}) ) + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'], name='Questions') ) if self.title != '': p += ggtitle(self.title) return p else: if self.save_df is not None: eprint(f'Saving df to: {self.save_df}') df.to_json(self.save_df) return ( ggplot(self.char_plot_df) + aes(x='char_percent', y='correct', color='Guessing_Model') + stat_smooth(method='mavg', se=False, method_args={'window': 500}) + scale_y_continuous(breaks=np.linspace(0, 1, 6)) + coord_cartesian(ylim=limits) )
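# stat_summary_bin(fun_data=mean_no_se, ...) above expects a function that
# maps the values in a bin to a one-row frame with y/ymin/ymax columns.
# A sketch consistent with the name (mean, no standard-error band); the real
# definition may differ:
import numpy as np
import pandas as pd

def mean_no_se(series):
    m = np.mean(series)
    return pd.DataFrame({'y': [m], 'ymin': [m], 'ymax': [m]})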
to_plot = 150
margin = 0.5
x_loc = 3
rcn = 0.1
optlin = 0.4
boundary_factor = b_from_error(optlin)

train, _, _ = prep_dataset(to_plot, margin=margin, x_loc=x_loc, rcn=rcn,
                           boundary_factor=boundary_factor)

(ggplot(train, aes(x='x', y='y', color='group', shape='group'))
 + geom_point(size=4, fill='none')
 + scale_shape_manual(values=('o', 'P'))
 + geom_vline(xintercept=[-1 * boundary_factor, 0, boundary_factor],
              linetype='dotted')
 + scale_x_continuous(breaks=np.arange(-6, 7, 1), name='')
 + scale_y_continuous(name='')
 + theme(legend_position='none')
 )


# Create a function which runs the experiment for a given learner and
# hyperparameter configuration.

# In[9]:


def experiment(learner, n_train, optlin, rcn, lr=1e-2, train_batch_size=1,
               epochs=1, data_seed=123, filestring='na',
               sgd_shuffle_seed=123):
    boundary_factor = b_from_error(optlin)
    # Note that the training batch size matters: it influences the learned
    # model.
def scatter_plot(df, x, y, group=None, facet_x=None, facet_y=None, base_size=10, figure_size=(6, 3), **kwargs): ''' Aggregates data in df and plots as a scatter plot chart. Parameters ---------- df : pd.DataFrame input dataframe x : str quoted expression to be plotted on the x axis y : str quoted expression to be plotted on the y axis group : str quoted expression to be used as group (ie color) facet_x : str quoted expression to be used as facet facet_y : str quoted expression to be used as facet base_size : int base size for theme_ez figure_size :tuple of int figure size **kwargs: additional kwargs passed to geom_point Returns ------- g : EZPlot EZplot object ''' # create a copy of the data dataframe = df.copy() # define groups and variables; remove and store (eventual) names names = {} groups = {} variables = {} for label, var in zip(['x', 'group', 'facet_x', 'facet_y'], [x, group, facet_x, facet_y]): names[label], groups[label] = unname(var) names['y'], variables['y'] = unname(y) # fix special cases if x == '.index': groups['x'] = '.index' names[ 'x'] = dataframe.index.name if dataframe.index.name is not None else '' # aggregate data and reorder columns gdata = agg_data(dataframe, variables, groups, None, fill_groups=True) gdata = gdata[[ c for c in ['x', 'y', 'group', 'facet_x', 'facet_y'] if c in gdata.columns ]] # add group_x column if group is not None: gdata['group_x'] = gdata['group'].astype( 'str') + '_' + gdata['x'].astype(str) g = EZPlot(gdata) # set groups if group is None: g += p9.geom_point(p9.aes(x="x", y="y"), colour=ez_colors(1)[0], **kwargs) else: g += p9.geom_point( p9.aes(x="x", y="y", group="factor(group)", color="factor(group)"), **kwargs) g += p9.scale_color_manual(values=ez_colors(g.n_groups('group'))) # set facets if facet_x is not None and facet_y is None: g += p9.facet_wrap('~facet_x') if facet_x is not None and facet_y is not None: g += p9.facet_grid('facet_y~facet_x') # set x scale if g.column_is_timestamp('x'): g += p9.scale_x_datetime() elif g.column_is_categorical('x'): g += p9.scale_x_discrete() else: g += p9.scale_x_continuous(labels=ez_labels) # set y scale if g.column_is_timestamp('y'): g += p9.scale_y_datetime() elif g.column_is_categorical('y'): g += p9.scale_y_discrete() else: g += p9.scale_y_continuous(labels=ez_labels) # set axis labels g += \ p9.xlab(names['x']) + \ p9.ylab(names['y']) # set theme g += theme_ez(figure_size=figure_size, base_size=base_size, legend_title=p9.element_text(text=names['group'], size=base_size)) return g
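# Usage sketch for scatter_plot on a toy frame; ezplot's helpers are assumed
# importable as in the functions above, and the column names are hypothetical.
# Extra kwargs such as alpha are forwarded to geom_point.
toy = pd.DataFrame({
    'height': np.random.randn(200) + 170,
    'weight': np.random.randn(200) + 70,
    'sex': np.random.choice(['f', 'm'], size=200)})
g = scatter_plot(toy, x='height', y='weight', group='sex', alpha=0.5)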
    lambda x: x.aupr_mean - (critical_val * x.aupr_std) / np.sqrt(x.lf_num_len)
}))
dev_set_stats_df


# In[9]:


(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="auroc_mean", color="model"))
 + p9.geom_point()
 + p9.geom_line(p9.aes(group="model"))
 + p9.geom_errorbar(p9.aes(ymin="auroc_lower", ymax="auroc_upper",
                           group="model"))
 + p9.theme_seaborn()
 + p9.labs(title="DaG Tune Set AUROC", color="Model")
 + p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
 })
 + p9.scale_y_continuous(limits=[0.4, 0.75]))


# In[10]:


(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="aupr_mean", color="model"))
 + p9.geom_point()
 + p9.geom_line(p9.aes(group="model"))
 + p9.geom_errorbar(p9.aes(ymin="aupr_lower", ymax="aupr_upper",
                           group="model"))
 + p9.theme_seaborn()
 + p9.labs(title="DaG Tune Set AUPR", color="Model")
 + p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
 })
 + p9.scale_y_continuous(limits=[0.4, 0.75]))


# In[11]:
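# The error bars above are normal-approximation confidence intervals,
# mean ± z * std / sqrt(n). A sketch of how critical_val would be obtained
# for a 95% interval, assuming scipy is available:
from scipy.stats import norm

critical_val = norm.ppf(0.975)  # ≈ 1.96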
def area_plot(df, x, y, group=None, facet_x=None, facet_y=None, aggfun='sum', fill=False, sort_groups=True, base_size=10, figure_size=(6, 3)): ''' Aggregates data in df and plots as a stacked area chart. Parameters ---------- df : pd.DataFrame input dataframe x : str quoted expression to be plotted on the x axis y : str quoted expression to be plotted on the y axis group : str quoted expression to be used as group (ie color) facet_x : str quoted expression to be used as facet facet_y : str quoted expression to be used as facet aggfun : str or fun function to be used for aggregating (eg sum, mean, median ...) fill : bool plot shares for each group instead of absolute values sort_groups : bool sort groups by the sum of their value (otherwise alphabetical order is used) base_size : int base size for theme_ez figure_size :tuple of int figure size Returns ------- g : EZPlot EZplot object ''' # create a copy of the data dataframe = df.copy() # define groups and variables; remove and store (eventual) names names = {} groups = {} variables = {} for label, var in zip(['x', 'group', 'facet_x', 'facet_y'], [x, group, facet_x, facet_y]): names[label], groups[label] = unname(var) names['y'], variables['y'] = unname(y) # fix special cases if x == '.index': groups['x'] = '.index' names[ 'x'] = dataframe.index.name if dataframe.index.name is not None else '' # aggregate data and reorder columns gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True) gdata['y'].fillna(0, inplace=True) gdata = gdata[[ c for c in ['x', 'y', 'group', 'facet_x', 'facet_y'] if c in gdata.columns ]] if fill: groups_to_normalize = [ c for c in ['x', 'facet_x', 'facet_y'] if c in gdata.columns ] total_values = gdata \ .groupby(groups_to_normalize)['y'] \ .sum() \ .reset_index() \ .rename(columns = {'y':'tot_y'}) gdata = pd.merge(gdata, total_values, on=groups_to_normalize) gdata['y'] = gdata['y'] / (gdata['tot_y'] + EPSILON) gdata.drop('tot_y', axis=1, inplace=True) ylabeller = percent_labels else: ylabeller = ez_labels # get plot object g = EZPlot(gdata) # determine order and create a categorical type if sort_groups: sort_data_groups(g) # get colors colors = np.flip(ez_colors(g.n_groups('group'))) # set groups if group is None: g += p9.geom_area(p9.aes(x="x", y="y"), colour=None, fill=ez_colors(1)[0], na_rm=True) else: g += p9.geom_area(p9.aes(x="x", y="y", group="factor(group)", fill="factor(group)"), colour=None, na_rm=True) g += p9.scale_fill_manual(values=colors) # set facets if facet_x is not None and facet_y is None: g += p9.facet_wrap('~facet_x') if facet_x is not None and facet_y is not None: g += p9.facet_grid('facet_y~facet_x') # set x scale if g.column_is_timestamp('x'): g += p9.scale_x_datetime() elif g.column_is_categorical('x'): g += p9.scale_x_discrete() else: g += p9.scale_x_continuous(labels=ez_labels) # set y scale g += p9.scale_y_continuous(labels=ylabeller, expand=[0, 0, 0.1 * (not fill) + 0.03, 0]) # set axis labels g += \ p9.xlab(names['x']) + \ p9.ylab(names['y']) # set theme g += theme_ez(figure_size=figure_size, base_size=base_size, legend_title=p9.element_text(text=names['group'], size=base_size)) if sort_groups: g += p9.guides(fill=p9.guide_legend(reverse=True), color=p9.guide_legend(reverse=True)) return g
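# Usage sketch for area_plot on toy monthly data; column names are
# hypothetical. With fill=True each group's value is normalized to its share
# per x position.
toy = pd.DataFrame({
    'month': pd.date_range('2020-01-01', periods=12, freq='MS').tolist() * 2,
    'sales': np.random.rand(24) * 100,
    'region': ['north'] * 12 + ['south'] * 12})
g = area_plot(toy, x='month', y='sales', group='region', fill=True)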
def plot_difference(adata, plot_name='cellbender_results'): # Get the differences in counts per cell X_raw_minus_cb = adata.layers['counts_raw'] - adata.layers[ 'counts_cellbender'] X_dif = abs(X_raw_minus_cb) # Get the top most different genes df_diff_genes = pd.DataFrame(data=adata.var.gene_symbols.values) df_diff_genes['ensembl_id'] = adata.var.index df_diff_genes['gene_symbols'] = adata.var.gene_symbols.values df_diff_genes['dif_across_cells'] = np.asarray( X_dif.sum(axis=0)).reshape(-1) df_diff_genes = df_diff_genes.sort_values('dif_across_cells', ascending=False) # Select the top 100 genes and plot the difference in counts across # cells where x axis = gene, y axis = difference, and point = cell. top_n_genes = 100 df_plt = _make_data_plot_difference( adata, X_raw_minus_cb, df_diff_genes['gene_symbols'].head(n=top_n_genes)) # print(df_plt.head()) gplt = plt9.ggplot(df_plt) gplt = gplt + plt9.theme_bw() gplt = gplt + plt9.geom_boxplot(plt9.aes(x='gene_symbols', y='value'), alpha=0.25 #outlier_shape='' ) gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90)) gplt = gplt + plt9.labs( x='', y='Raw counts - cellbender adjusted', title='Top {} most different genes'.format(top_n_genes)) gplt.save( '{}-count_difference-boxplot.png'.format(plot_name), #dpi=300, width=14, height=4) gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=0)) gplt = gplt + plt9.coord_flip() gplt.save( '{}-count_difference-boxplot_vertical.png'.format(plot_name), #dpi=300, width=4, height=14) # Same plot but the abs difference on log scale df_plt = _make_data_plot_difference( adata, X_dif, df_diff_genes['gene_symbols'].head(n=top_n_genes)) df_plt['value'] += 1 # print(df_plt.head()) gplt = plt9.ggplot(df_plt) gplt = gplt + plt9.theme_bw() gplt = gplt + plt9.geom_boxplot(plt9.aes(x='gene_symbols', y='value'), alpha=0.25 # outlier_shape='' ) gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90)) gplt = gplt + plt9.labs( x='', y='Abs(raw counts - cellbender adjusted)', title='Top {} most different genes'.format(top_n_genes)) gplt = gplt + plt9.scale_y_continuous( trans='log10', labels=comma_labels, minor_breaks=0) gplt.save( '{}-abs_count_difference-boxplot.png'.format(plot_name), #dpi=300, width=14, height=4) gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=0)) gplt = gplt + plt9.coord_flip() gplt.save( '{}-abs_count_difference-boxplot_vertical.png'.format(plot_name), #dpi=300, width=4, height=14)
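# plot_difference expects an AnnData object with both layers present:
#   adata.layers['counts_raw']         raw UMI counts (cells x genes)
#   adata.layers['counts_cellbender']  cellbender-adjusted counts
# and relies on the module-level helper _make_data_plot_difference to melt
# the selected genes into a long frame with 'gene_symbols' and 'value'.
plot_difference(adata, plot_name='cellbender_results')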
def plot_line(data, nuclstr, columns=['value'], ymin=None, ymax=None, dpi=300,
              features=None, feature_types=['all'], add_features=[],
              funcgroups=None, shading_modes=['charge_functional'],
              right_overhang_fix=None, debug=False, startnumber=1,
              cropseq=(0, None), aspect_ratio=None, reverse_seq=False,
              transparent=True, xshift=0):
    """
    A wrapper function to make a line plot of data along the sequence.
    Input should be a dataframe with 'resid', 'segid' and 'value' columns.
    This one is inspired by seqplot/seqplot/pdb_plot.py

    funcgroup example:
    fg="\\funcgroup{xxx}{CT}{White}{Green}{upper}{up} \\funcgroup{xxx}{GA}{White}{Blue}{upper}{up}"
    """
    if isinstance(columns, str):
        columns = [columns]

    segid = data['segid'].values[0]
    title = "Segid: %s, Type: %s" % (segid, nuclstr.components[segid]['type'])
    # Use the protein alphabet for protein-like chains and the DNA alphabet
    # otherwise.
    seq = Seq(str(nuclstr.seqs[segid]['fullseq']),
              generic_protein
              if nuclstr.components[segid]['entity'] in ('histone', 'protein')
              else generic_dna)
    msar = MultipleSeqAlignment(
        [SeqRecord(seq=seq,
                   id=nuclstr.components[segid]['type'] + ':' + segid,
                   name=nuclstr.components[segid]['type'] + ':' + segid)])
    if reverse_seq:
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq = msar[0].seq[::-1]

    msar = msar[:, cropseq[0]:cropseq[1]]

    # print("Seq to plot:",msar)

    # We need to get the starting residue; currently for DNA chains only
    # cifseq gets it correctly.
    resid_start = nuclstr.seqs[segid]['resid_start']
    logger.debug("Starting resid %d" % int(resid_start))
    overhang = nuclstr.seqs[segid]['overhangL']
    datafixed = data.copy()
    datafixed.loc[:, 'resid'] = (datafixed.loc[:, 'resid'] - resid_start
                                 + overhang + 1 - cropseq[0] + xshift)
    # print(datafixed)
    sl = len(msar[0].seq)

    # fn=shade.seqfeat2shadefeat(msar,feature_types=feature_types,force_feature_pos='bottom',debug=debug)
    if features is None:
        fn = nuclstr.shading_features[segid]
    else:
        fn = features
    fn2 = []
    for i in fn:
        if (i['style'] in feature_types) or ('all' in feature_types):
            fn2.append(i)
    fn2.extend(add_features)
    shaded = ipyshade.shadedmsa4plot(
        msar, features=fn2, shading_modes=shading_modes, debug=debug,
        startnumber=startnumber,
        setends=[startnumber - 2, sl + startnumber + 2],
        funcgroups=funcgroups, density=200)

    # If sl % 10 == 0 we will have a ruler number hanging beyond the sequence
    # image, and we need to correct for that.
    if right_overhang_fix is None:
        if sl % 10 == 0:
            if sl < 100:
                rof = 0.1
            else:
                rof = 0.5
        else:
            rof = 0
    else:
        rof = right_overhang_fix

    if aspect_ratio is not None:
        ar = aspect_ratio
    else:
        ar = 0.15 * 100. / sl

    md = pd.melt(datafixed, id_vars=['segid', 'resid'], value_vars=columns)

    plot = (ggplot(data=md, mapping=aes(x='resid', y='value'))
            + geom_point(aes(color='variable'), size=0.1)
            + geom_line(aes(color='variable'), stat='identity')
            + scale_x_continuous(limits=(0.5, sl + 0.5 + rof),
                                 expand=(0, 0.2), name='', breaks=[])
            # + scale_y_continuous()
            + theme_light()
            + theme(aspect_ratio=ar, dpi=dpi, plot_margin=0))
    # + facet_wrap('~ segid',dir='v')

    if ymax is not None:
        plot = plot + scale_y_continuous(limits=(None, ymax))

    if ymin is None:
        ymin = md['value'].min()
    if ymax is None:
        ymax = md['value'].max()

    plot = plot + geom_seq_x(seqimg=shaded.img,
                             xlim=(1, sl + rof),
                             ylim=(ymin, ymax),
                             aspect_ratio=ar,
                             transparent=transparent) + ggtitle(title)

    return plot
# set facets if facet_x is not None and facet_y is None: g += p9.facet_wrap('~facet_x') if facet_x is not None and facet_y is not None: g += p9.facet_grid('facet_y~facet_x') # set x scale if g.column_is_timestamp('x'): g += p9.scale_x_datetime() elif g.column_is_categorical('x'): g += p9.scale_x_discrete() else: g += p9.scale_x_continuous(labels=ez_labels) # set y scale g += p9.scale_y_continuous(labels=ez_labels) # set axis labels g += \ p9.xlab(names['x']) + \ p9.ylab(names['y']) # set theme g += theme_ez(figure_size=figure_size, base_size=base_size, legend_title=p9.element_text(text=names['group'], size=base_size)) return g
def batch_plots(self): # First, put together active leak data and output for live plotting functionality # (no AL plot here currently) dfs = self.active_leak_dfs for i in range(len(dfs)): n_cols = dfs[i].shape[1] dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1) dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1) dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1) dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1) dfs[i]['program'] = self.directories[i] # Move reference program to the top of the list for i, df in enumerate(dfs): if df['program'].iloc[0] == self.ref_program: dfs.insert(0, dfs.pop(i)) # Arrange dfs for plot 1 dfs_p1 = dfs.copy() for i in range(len(dfs_p1)): # Reshape dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program']) # Combine dataframes into single dataframe for plotting df_p1 = dfs_p1[0] for i in dfs_p1[1:]: df_p1 = df_p1.append(i, ignore_index=True) # Output Emissions df for other uses (e.g. live plot) df_p1.to_csv(self.output_directory + 'mean_active_leaks.csv', index=True) # Now repeat for emissions (which will actually be used for batch plotting) dfs = self.emission_dfs for i in range(len(dfs)): n_cols = dfs[i].shape[1] dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1) dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1) dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1) dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1) dfs[i]['program'] = self.directories[i] # Move reference program to the top of the list for i, df in enumerate(dfs): if df['program'].iloc[0] == self.ref_program: dfs.insert(0, dfs.pop(i)) # Arrange dfs for plot 1 dfs_p1 = dfs.copy() for i in range(len(dfs_p1)): # Reshape dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program']) # Combine dataframes into single dataframe for plotting df_p1 = dfs_p1[0] for i in dfs_p1[1:]: df_p1 = df_p1.append(i, ignore_index=True) # Output Emissions df for other uses (e.g. 
live plot) df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True) # Make plots from list of dataframes - one entry per dataframe pn.theme_set(pn.theme_linedraw()) plot1 = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program') + pn.geom_ribbon(df_p1, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2) + pn.geom_line(df_p1, pn.aes('datetime', 'mean', colour='program'), size=1) + pn.ylab('Daily emissions (kg/site)') + pn.xlab('') + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) + pn.scale_x_datetime(labels=date_format('%Y')) + pn.scale_y_continuous(trans='log10') + pn.ggtitle('To reduce uncertainty, use more simulations.') + pn.labs(color='Program', fill='Program') + pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2), panel_grid_minor_x=pn.element_blank(), panel_grid_major_x=pn.element_blank(), panel_grid_minor_y=pn.element_line( colour='black', linewidth=0.5, alpha=0.3), panel_grid_major_y=pn.element_line( colour='black', linewidth=1, alpha=0.5)) ) plot1.save(self.output_directory + 'program_comparison.png', width=7, height=3, dpi=900) # Build relative mitigation plots dfs_p2 = dfs.copy() for i in dfs_p2[1:]: i['mean_dif'] = 0 i['std_dif'] = 0 i['mean_ratio'] = 0 i['std_ratio'] = 0 for j in range(len(i)): ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean'] ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std'] alt_mean = i.loc[i.index[j], 'mean'] alt_std = i.loc[i.index[j], 'std'] i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean i.loc[i.index[j], 'std_dif'] = math.sqrt( math.pow(alt_std, 2) + math.pow(ref_std, 2)) i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean i.loc[i.index[j], 'std_ratio'] = math.sqrt( math.pow((alt_std / alt_mean), 2) + math.pow((ref_std / ref_mean), 2)) # Build plotting dataframe df_p2 = self.dates_trunc.copy().to_frame() df_p2['program'] = dfs_p2[1]['program'] df_p2['mean_dif'] = dfs_p2[1]['mean_dif'] df_p2['std_dif'] = dfs_p2[1]['std_dif'] df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio'] df_p2['std_ratio'] = dfs_p2[1]['std_ratio'] df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif'] df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif'] df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (dfs_p2[1] ['mean_ratio'] + 2 * dfs_p2[1]['std_ratio']) df_p2['high_ratio'] = dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'] pd.options.mode.chained_assignment = None for i in dfs_p2[2:]: i['low_dif'] = i['mean_dif'] - 2 * i['std_dif'] i['high_dif'] = i['mean_dif'] + 2 * i['std_dif'] i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] + 2 * i['std_ratio']) i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio'] short_df = i[['program', 'mean_dif', 'std_dif', 'low_dif', 'high_dif', 'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio']] short_df['datetime'] = np.array(self.dates_trunc) df_p2 = df_p2.append(short_df, ignore_index=True) # Make plot 2 plot2 = (pn.ggplot(None) + pn.aes('datetime', 'mean_dif', group='program') + pn.geom_ribbon( df_p2, pn.aes(ymin='low_dif', ymax='high_dif', fill='program'), alpha=0.2) + pn.geom_line(df_p2, pn.aes('datetime', 'mean_dif', colour='program'), size=1) + pn.ylab('Daily emissions difference (kg/site)') + pn.xlab('') + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) + pn.scale_x_datetime(labels=date_format('%Y')) + pn.ggtitle('Daily differences may be uncertain for small sample sizes') + # pn.scale_y_continuous(trans='log10') + pn.labs(color='Program', fill='Program') + pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2), 
panel_grid_minor_x=pn.element_blank(), panel_grid_major_x=pn.element_blank(), panel_grid_minor_y=pn.element_line( colour='black', linewidth=0.5, alpha=0.3), panel_grid_major_y=pn.element_line( colour='black', linewidth=1, alpha=0.5)) ) plot2.save(self.output_directory + 'relative_mitigation.png', width=7, height=3, dpi=900) # Make plot 3 plot3 = (pn.ggplot(None) + pn.aes('datetime', 'mean_ratio', group='program') + pn.geom_ribbon(df_p2, pn.aes( ymin='low_ratio', ymax='high_ratio', fill='program'), alpha=0.2) + pn.geom_hline(yintercept=1, size=0.5, colour='blue') + pn.geom_line(df_p2, pn.aes('datetime', 'mean_ratio', colour='program'), size=1) + pn.ylab('Emissions ratio') + pn.xlab('') + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) + pn.scale_x_datetime(labels=date_format('%Y')) + pn.ggtitle( 'Blue line represents equivalence. \nIf uncertainty is high, use more ' 'simulations and/or sites. \nLook also at ratio of mean daily emissions' 'over entire timeseries.') + pn.labs(color='Program', fill='Program') + pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2), panel_grid_minor_x=pn.element_blank(), panel_grid_major_x=pn.element_blank(), panel_grid_minor_y=pn.element_line( colour='black', linewidth=0.5, alpha=0.3), panel_grid_major_y=pn.element_line( colour='black', linewidth=1, alpha=0.5)) ) plot3.save(self.output_directory + 'relative_mitigation2.png', width=7, height=3, dpi=900) # --------------------------------------- # ------ Figure to compare costs ------ dfs = self.cost_dfs for i in range(len(dfs)): n_cols = dfs[i].shape[1] dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1) dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1) dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1) dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1) dfs[i]['program'] = self.directories[i] # Move reference program to the top of the list for i, df in enumerate(dfs): if df['program'].iloc[0] == self.ref_program: dfs.insert(0, dfs.pop(i)) # Arrange dfs for plot 1 dfs_p1 = dfs.copy() for i in range(len(dfs_p1)): # Reshape dfs_p1[i] = pd.melt(dfs_p1[i], id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program']) # Combine dataframes into single dataframe for plotting df_p1 = dfs_p1[0] for i in dfs_p1[1:]: df_p1 = df_p1.append(i, ignore_index=True) # Output Emissions df for other uses (e.g. 
live plot) df_p1.to_csv(self.output_directory + 'rolling_cost_estimates.csv', index=True) # Make plots from list of dataframes - one entry per dataframe pn.theme_set(pn.theme_linedraw()) plot1 = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program') + pn.geom_ribbon(df_p1, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2) + pn.geom_line(df_p1, pn.aes('datetime', 'mean', colour='program'), size=1) + pn.ylab('Estimated cost per facility') + pn.xlab('') + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9) + pn.scale_x_datetime(labels=date_format('%Y')) + # pn.scale_y_continuous(trans='log10') + pn.labs(color='Program', fill='Program') + pn.theme(panel_border=pn.element_rect(colour="black", fill=None, size=2), panel_grid_minor_x=pn.element_blank(), panel_grid_major_x=pn.element_blank(), panel_grid_minor_y=pn.element_line( colour='black', linewidth=0.5, alpha=0.3), panel_grid_major_y=pn.element_line( colour='black', linewidth=1, alpha=0.5)) ) plot1.save(self.output_directory + 'cost_estimate_temporal.png', width=7, height=3, dpi=900) ######################################## # Cost breakdown by program and method method_lists = [] for i in range(len(self.directories)): df = pd.read_csv( self.output_directory + self.directories[i] + "/timeseries_output_0.csv") df = df.filter(regex='cost$', axis=1) df = df.drop(columns=["total_daily_cost"]) method_lists.append(list(df)) costs = [[] for i in range(len(self.all_data))] for i in range(len(self.all_data)): for j in range(len(self.all_data[i])): simcosts = [] for k in range(len(method_lists[i])): timesteps = len(self.all_data[i][j][method_lists[i][k]]) simcosts.append( (sum(self.all_data[i][j][method_lists[i][k]])/timesteps/self.n_sites)*365) costs[i].append(simcosts) rows_list = [] for i in range(len(costs)): df_temp = pd.DataFrame(costs[i]) for j in range(len(df_temp.columns)): dict = {} dict.update({'Program': self.directories[i]}) dict.update({'Mean Cost': round(df_temp.iloc[:, j].mean())}) dict.update({'St. Dev.': df_temp.iloc[:, j].std()}) dict.update({'Method': method_lists[i][j].replace('_cost', '')}) rows_list.append(dict) df = pd.DataFrame(rows_list) # Output Emissions df for other uses df.to_csv(self.output_directory + 'cost_comparison.csv', index=True) plot = ( pn.ggplot( df, pn.aes( x='Program', y='Mean Cost', fill='Method', label='Mean Cost')) + pn.geom_bar(stat="identity") + pn.ylab('Cost per Site per Year') + pn.xlab('Program') + pn.scale_fill_hue(h=0.15, l=0.25, s=0.9) + pn.geom_text(size=15, position=pn.position_stack(vjust=0.5)) + pn.theme( panel_border=pn.element_rect(colour="black", fill=None, size=2), panel_grid_minor_x=pn.element_blank(), panel_grid_major_x=pn.element_blank(), panel_grid_minor_y=pn.element_line( colour='black', linewidth=0.5, alpha=0.3), panel_grid_major_y=pn.element_line( colour='black', linewidth=1, alpha=0.5))) plot.save(self.output_directory + 'cost_comparison.png', width=7, height=3, dpi=900) return
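# The per-program summary-statistics block appears three times in
# batch_plots above (active leaks, emissions, costs). A sketch of a helper
# with the same behavior that would remove the repetition:
def add_summary_columns(df, program):
    # n_cols is captured before new columns are appended, so the statistics
    # are computed over the original simulation columns only.
    n_cols = df.shape[1]
    vals = df.iloc[:, 0:n_cols]
    df['mean'] = vals.mean(axis=1)
    df['std'] = vals.std(axis=1)
    df['low'] = vals.quantile(0.025, axis=1)
    df['high'] = vals.quantile(0.975, axis=1)
    df['program'] = program
    return df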
def density_plot(df, x, group=None, facet_x=None, facet_y=None, position='overlay', sort_groups=True, base_size=10, figure_size=(6, 3), **stat_kwargs): ''' Plot a 1-d density plot Parameters ---------- df : pd.DataFrame input dataframe x : str quoted expression to be plotted on the x axis group : str quoted expression to be used as group (ie color) facet_x : str quoted expression to be used as facet facet_y : str quoted expression to be used as facet position : str if groups are present, choose between `stack` or `overlay` base_size : int base size for theme_ez figure_size :tuple of int figure size stat_kwargs : kwargs kwargs for the density stat Returns ------- g : EZPlot EZplot object ''' if position not in ['overlay', 'stack']: log.error("position not recognized") raise NotImplementedError("position not recognized") # create a copy of the data dataframe = df.copy() # define groups and variables; remove and store (eventual) names names = {} groups = {} variables = {} for label, var in zip(['x', 'group', 'facet_x', 'facet_y'], [x, group, facet_x, facet_y]): names[label], groups[label] = unname(var) # fix special cases if x == '.index': groups['x'] = '.index' names[ 'x'] = dataframe.index.name if dataframe.index.name is not None else '' # aggregate data and reorder columns gdata = agg_data(dataframe, variables, groups, None, fill_groups=False) gdata = gdata[[ c for c in ['x', 'group', 'facet_x', 'facet_y'] if c in gdata.columns ]] # start plotting g = EZPlot(gdata) # determine order and create a categorical type colors = ez_colors(g.n_groups('group')) # set groups if group is None: g += p9.geom_density(p9.aes(x="x"), stat=p9.stats.stat_density(**stat_kwargs), colour=ez_colors(1)[0], fill=ez_colors(1)[0], **POSITION_KWARGS[position]) else: g += p9.geom_density(p9.aes(x="x", group="factor(group)", colour="factor(group)", fill="factor(group)"), stat=p9.stats.stat_density(**stat_kwargs), **POSITION_KWARGS[position]) g += p9.scale_fill_manual(values=colors, reverse=False) g += p9.scale_color_manual(values=colors, reverse=False) # set facets if facet_x is not None and facet_y is None: g += p9.facet_wrap('~facet_x') if facet_x is not None and facet_y is not None: g += p9.facet_grid('facet_y~facet_x') # set x scale if g.column_is_categorical('x'): g += p9.scale_x_discrete() else: g += p9.scale_x_continuous(labels=ez_labels) # set y scale g += p9.scale_y_continuous(labels=ez_labels) # set axis labels g += \ p9.xlab(names['x']) + \ p9.ylab('Density') # set theme g += theme_ez(figure_size=figure_size, base_size=base_size, legend_title=p9.element_text(text=names['group'], size=base_size)) if sort_groups: g += p9.guides(fill=p9.guide_legend(reverse=True)) return g
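# Usage sketch for density_plot; column names are hypothetical and ezplot's
# helpers are assumed importable as above.
toy = pd.DataFrame({
    'score': np.concatenate([np.random.randn(500),
                             np.random.randn(500) + 2]),
    'model': ['a'] * 500 + ['b'] * 500})
g = density_plot(toy, x='score', group='model', position='overlay')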
def plot_ecdf(df_plot,
              variable_column,
              color_column='none',
              output_file='plot_distribution',
              facet_column='none',
              x_log10=False):
    """Plot an empirical cumulative distribution to png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with <variable_column> as a column.
    variable_column : string
        Name of the column to plot.
    color_column : string
        Name of the color column.
    output_file : string
        Basename of output file.
    facet_column : string
        Column to facet the plot by.
    x_log10 : bool
        If True, plot the x axis on a log10 scale.

    Returns
    -------
    int
        Zero on success.
    """
    n_colors = 0
    if color_column != 'none':
        gplt = plt9.ggplot(df_plot,
                           plt9.aes(x=variable_column, color=color_column))
        n_colors = df_plot[color_column].nunique()
    else:
        gplt = plt9.ggplot(df_plot, plt9.aes(x=variable_column))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.stat_ecdf(alpha=0.8)
    if x_log10:
        gplt = gplt + plt9.scale_x_continuous(trans='log10', minor_breaks=0)
    else:
        gplt = gplt + plt9.scale_x_continuous(minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(minor_breaks=0)
    gplt = gplt + plt9.labs(y='Cumulative density', title='')
    # Hide the legend when there are too many colors to read; use a
    # qualitative palette when there are few enough for one.
    if n_colors != 0 and n_colors > 20:
        gplt = gplt + plt9.theme(legend_position='none')
    elif n_colors != 0 and n_colors < 9:
        gplt = gplt + plt9.scale_colour_brewer(palette='Dark2', type='qual')
    if facet_column != 'none':
        gplt = gplt + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
        n_facets = df_plot[facet_column].nunique()
        gplt.save('{}.png'.format(output_file),
                  dpi=300,
                  width=6 * (n_facets / 4),
                  height=4 * (n_facets / 4),
                  limitsize=False)
    else:
        gplt.save('{}.png'.format(output_file), dpi=300, width=4, height=4)
    return 0
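# Usage sketch for plot_ecdf on a synthetic frame; column names are
# hypothetical.
toy = pd.DataFrame({
    'n_genes': np.random.lognormal(7, 0.5, 3000),
    'sample': np.random.choice(['s1', 's2', 's3'], size=3000)})
plot_ecdf(toy, 'n_genes', color_column='sample',
          output_file='ecdf-n_genes', x_log10=True)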
def generate_map(data, region, value_field, iso_field='iso', scale_params=None, plot_na_dots=False, tolerance=None, plot_size=8, out_region_color='#f0f0f0', na_color='#aaaaaa', line_color='#666666', projection=None): """ This function returns a map plot with the specified options. :param pandas.DataFrame data: Data to be plotted. :param str region: Region to center the map around. Countries outside the chosen region will be obscured. :param str value_field: Column of *data* with the values to be plotted. :param str iso_field: Column of *data* with the ISO3 codes for each country. :param dict scale_params: Dictionary of parameters to be passed to the ggplot corresponding color scale (continuous or discrete). :param bool plot_na_dots: Whether to plot the dots for small countries if said country doesn't have data available. :param int tolerance: Coordinate tolerance for polygon simplification, a higher number will result in simpler polygons and faster rendering (see DEFAULT_TOLERANCES). :param int plot_size: Size of the plot, which determines the relative sizes of the elements within. :param str out_region_color: Hex color of the countries that are out of the specified region. :param str na_color: Hex color of the countries with no data available. :param str line_color: Color of the country borders. :param str projection: Kind of map projection to be used in the map. Currently, Oceania (XOX) is only available in ESPG:4326 to enable wrapping. :returns: a ggplot-like plot with the map :rtype: plotnine.ggplot """ if projection is None: if region == 'XOX': projection = 'epsg4326' else: projection = 'robinson' if projection not in PROJECTION_DICT.keys(): raise ValueError('Projection "{}" not valid'.format(projection)) if scale_params is None: scale_params = {} if region not in REGION_BOUNDS[projection]: raise ValueError( '"region" not available. Valid regions are: {}'.format(', '.join( REGION_BOUNDS[projection].keys()))) if tolerance is None: tolerance = DEFAULT_TOLERANCES[projection][region] countries = GeoDataFrame.from_file( os.path.join(os.path.dirname(__file__), 'data/world-countries.shp')) # To plot Oceania we need the original EPSG:4326 to wrap around the 180º # longitude. In other cases transform to the desired projection. 
if region == 'XOX': countries.crs['lon_wrap'] = '180' # Wrap around longitude 180º XOX_countries = countries['continent'] == 'XOX' countries[XOX_countries] = countries[XOX_countries].to_crs( countries.crs) centroids = countries[XOX_countries].apply( lambda row: row['geometry'].centroid, axis=1) countries.loc[XOX_countries, 'lon'] = [c.x for c in centroids] countries.loc[XOX_countries, 'lat'] = [c.y for c in centroids] else: if projection != 'epsg4326': countries = countries.to_crs(PROJECTION_DICT[projection]) centroids = countries.apply(lambda row: row['geometry'].centroid, axis=1) countries['lon'] = [c.x for c in centroids] countries['lat'] = [c.y for c in centroids] countries['geometry'] = countries['geometry'].simplify(tolerance) upper_left, lower_right = REGION_BOUNDS[projection][region] limits_x = [upper_left[0], lower_right[0]] limits_y = [lower_right[1], upper_left[1]] ratio = (limits_x[1] - limits_x[0]) / (limits_y[1] - limits_y[0]) plot_data = pd.merge(countries, data, how='left', left_on='iso', right_on=iso_field) map_bounds = REGION_BOUNDS['epsg4326'][region] map_area = ((map_bounds[1][0] - map_bounds[0][0]) * (map_bounds[0][1] - map_bounds[1][1])) plot_data['plot_dot'] = (plot_data['pol_area'] < DOT_THRESHOLD * map_area) if not plot_na_dots: plot_data['plot_dot'] &= ~pd.isnull(plot_data[value_field]) if region != 'XWX': in_region = ((~pd.isnull(plot_data[value_field])) & (plot_data['continent'] == region)) in_region_missing = ((pd.isnull(plot_data[value_field])) & (plot_data['continent'] == region)) out_region = plot_data['continent'] != region else: in_region = ~pd.isnull(plot_data[value_field]) in_region_missing = pd.isnull(plot_data[value_field]) out_region = np.repeat(False, len(plot_data)) if plot_data[value_field].dtype == 'object': # Assume discrete values fill_scale = scale_fill_brewer(**scale_params, drop=False) else: # Assume continuous values fill_scale = scale_fill_gradient(**scale_params) plot_data_values = plot_data[in_region] plot_data_missing = plot_data[in_region_missing] plot_data_out_region = plot_data[out_region] dots_region = plot_data_values[plot_data_values['plot_dot']] dots_region_missing = plot_data_missing[plot_data_missing['plot_dot']] dots_out_region = plot_data_out_region[plot_data_out_region['plot_dot']] plt = ( ggplot() + geom_map(plot_data_values, aes(fill=value_field), color=line_color, size=0.3) + geom_map( plot_data_missing, aes(color='plot_dot'), fill=na_color, size=0.3) + geom_map(plot_data_out_region, fill=out_region_color, color=line_color, size=0.3) + geom_point(dots_region, aes(x='lon', y='lat', fill=value_field), size=3, stroke=.1, color=line_color) + geom_point(dots_region_missing, aes(x='lon', y='lat'), fill=na_color, size=3, stroke=.1, color=line_color) + geom_point(dots_out_region, aes(x='lon', y='lat'), fill=out_region_color, size=3, stroke=.1, color=line_color) + scale_x_continuous(breaks=[], limits=limits_x) + scale_y_continuous(breaks=[], limits=limits_y) + theme( figure_size=(plot_size * ratio, plot_size), panel_background=element_rect(fill='white', color='black'), # panel_border=element_rect(fill='white', # color='black', # size=.1), legend_background=element_rect( fill="white", color='black', size=.5), legend_box_just='left') + xlab('') + ylab('')) if len(plot_data_values.index) > 0: plt += fill_scale plt += scale_color_manual(name=' ', values=[line_color], breaks=[False], labels=['No data available']) if plot_data[value_field].dtype == 'object': plt += guides(fill=guide_legend(override_aes={'shape': None})) return { 'plot': 
plt, 'ratio': ratio, }
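# Usage sketch for generate_map. The data frame needs the value column and an
# ISO3 code column; the region code and column names below are hypothetical,
# so check REGION_BOUNDS for the codes actually available.
growth = pd.DataFrame({
    'iso': ['DEU', 'FRA', 'ESP'],
    'gdp_growth': [1.1, 1.4, 2.0]})
result = generate_map(growth, region='XEU', value_field='gdp_growth')
result['plot'].save('gdp_map.png')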
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
        Read AnnData object and list of phenotypes. Plot boxplots of \
        phenotypes across clusters.
        """)
    parser.add_argument('-h5', '--h5_anndata',
                        action='store',
                        dest='h5',
                        required=True,
                        help='H5 AnnData file.')
    parser.add_argument('--pheno_columns',
                        action='store',
                        dest='pheno_columns',
                        default='',
                        help='Comma-separated pheno columns to be boxplotted \
                        by cluster.')
    parser.add_argument('-of', '--output_file',
                        action='store',
                        dest='of',
                        default='plot_boxplot_cluster',
                        help='Basename of output png file. Will have .png \
                        appended. (default: %(default)s)')
    options = parser.parse_args()

    adata = sc.read_h5ad(filename=options.h5)
    pheno_to_plot = options.pheno_columns.split(',')
    plt_height = 4
    plt_width = 16

    # Plot the data.
    for pheno in pheno_to_plot:
        # plt_width = adata.obs['cluster'].nunique() * 0.25
        gplt = plt9.ggplot(adata.obs)
        gplt = gplt + plt9.geom_boxplot(plt9.aes(x='cluster', y=pheno))
        gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90))
        gplt.save('{}-{}.png'.format(options.of, pheno),
                  dpi=300,
                  width=plt_width,
                  height=plt_height)

        # Add log10 transformation plot, shifting the values first if needed
        # so that they are all positive before the transform.
        lab = 'log10'
        if adata.obs[pheno].min() < 0:
            adata.obs[pheno] = adata.obs[pheno] + abs(
                adata.obs[pheno].min()) + 1
            lab = 'plusmin1log10'
        elif adata.obs[pheno].min() == 0:
            adata.obs[pheno] = adata.obs[pheno] + 1
            lab = 'plus1log10'
        gplt = plt9.ggplot(adata.obs)
        gplt = gplt + plt9.geom_boxplot(plt9.aes(x='cluster', y=pheno))
        gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90))
        gplt = gplt + plt9.scale_y_continuous(trans='log10', minor_breaks=0)
        gplt.save('{}_{}-{}.png'.format(options.of, lab, pheno),
                  dpi=300,
                  width=plt_width,
                  height=plt_height)
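# Example invocation (shell), assuming this script is saved as
# plot_boxplot_cluster.py and the phenotype columns exist in adata.obs:
#   python plot_boxplot_cluster.py \
#       -h5 adata-clustered.h5ad \
#       --pheno_columns total_counts,pct_counts_mito \
#       -of plot_boxplot_cluster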