def test_legend_fill_ratio():
    p = (ggplot(df_linear, aes('x', color='x<0.5'))
         + geom_point(aes(y='y_noisy'))
         + geom_smooth(aes(y='y_noisy'), method='lm',
                       size=0.5, span=.3)
         )

    assert p == 'legend_fill_ratio'
def test_step():
    p = (ggplot(df, aes('x'))
         + geom_step(aes(y='y'), size=4)
         + geom_step(aes(y='y+2'), color='red',
                     direction='vh', size=4))

    assert p == 'step'
def test_non_linear_smooth_no_ci():
    p = (ggplot(df_linear, aes('x'))
         + geom_point(aes(y='y_noisy'))
         + geom_smooth(aes(y='y_noisy'), method='loess',
                       span=.3, color='blue', se=False)
         )

    assert p == 'non_linear_smooth_no_ci'
def test_linear_smooth():
    p = (ggplot(df_linear, aes('x'))
         + geom_point(aes(y='y_noisy'))
         + geom_smooth(aes(y='y_noisy'), method='lm',
                       span=.3, color='blue')
         )

    assert p == 'linear_smooth'
def plot():
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    df_user_grouped = df.groupby('uid')
    user_stat = df_user_grouped.agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(df)))

    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = pd.Series(
        user_stat.log_n_records.apply(lambda x: x / max_color),
        index=user_stat.index)

    # 2D user plot
    p0 = ggplot(user_stat) \
        + geom_point(aes(x='relative_position', y='result',
                         size='user_n_records',
                         color='log_n_records', alpha='alpha'),
                     show_legend={'color': False, 'alpha': False,
                                  'size': False}) \
        + scale_color_gradient(high='#e31a1c', low='#ffffcc') \
        + labs(x='Average buzzing position', y='Accuracy') \
        + theme(aspect_ratio=1)
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    # p0.draw()
    print('p0 done')

    # histogram of number of records
    p1 = ggplot(user_stat, aes(x='log_n_records', y='..density..')) \
        + geom_histogram(color='#e6550d', fill='#fee6ce') \
        + geom_density() \
        + labs(x='Log number of records', y='Density') \
        + theme(aspect_ratio=0.3)
    p1.save(os.path.join(outdir, 'protobowl_hist.pdf'))
    # p1.draw()
    print('p1 done')

    # histogram of accuracy
    p2 = ggplot(user_stat, aes(x='result', y='..density..')) \
        + geom_histogram(color='#31a354', fill='#e5f5e0') \
        + geom_density() \
        + labs(x='Accuracy', y='Density') \
        + theme(aspect_ratio=0.3)
    p2.save(os.path.join(outdir, 'protobowl_acc.pdf'))
    # p2.draw()
    print('p2 done')

    # histogram of buzzing position
    p3 = ggplot(user_stat, aes(x='relative_position', y='..density..')) \
        + geom_histogram(color='#3182bd', fill='#deebf7') \
        + geom_density() \
        + labs(x='Average buzzing position', y='Density') \
        + theme(aspect_ratio=0.3)
    p3.save(os.path.join(outdir, 'protobowl_pos.pdf'))
    # p3.draw()
    print('p3 done')
def test_arrow():
    p = (ggplot(df, aes('x', 'y'))
         + geom_path(size=2, arrow=arrow(ends='both', type='closed'))
         + geom_path(aes(y='y+2'), color='red', size=2,
                     arrow=arrow(angle=60, length=1, ends='first'))
         + geom_path(aes(y='y+4'), color='blue', size=2,
                     arrow=arrow(length=1)))

    assert p == 'arrow'
def test_quantiles_width_dodge():
    p = (ggplot(df, aes('x'))
         + geom_violin(aes(y='y'), draw_quantiles=[.25, .75], size=2)
         + geom_violin(aes(y='y+25'), color='green', width=0.5, size=2)
         + geom_violin(aes(y='y+50', fill='factor(y%2)'), size=2)
         + theme(subplots_adjust={'right': 0.85}))

    assert p == 'quantiles_width_dodge'
def test_aesthetics():
    df = pd.DataFrame({
        'a': range(5),
        'b': 2, 'c': 3, 'd': 4, 'e': 5,
        'f': 6, 'g': 7, 'h': 8, 'i': 9
    })

    p = (ggplot(df, aes(y='a'))
         + geom_point(aes(x='b'))
         + geom_point(aes(x='c', size='a'))
         + geom_point(aes(x='d', alpha='a'), size=10, show_legend=False)
         + geom_point(aes(x='e', shape='factor(a)'), size=10,
                      show_legend=False)
         + geom_point(aes(x='f', color='factor(a)'), size=10,
                      show_legend=False)
         + geom_point(aes(x='g', fill='a'), stroke=0, size=10,
                      show_legend=False)
         + geom_point(aes(x='h', stroke='a'), fill='white',
                      color='green', size=10)
         + geom_point(aes(x='i', shape='factor(a)'), fill='brown',
                      stroke=2, size=10, show_legend=False)
         + theme(subplots_adjust={'right': 0.85}))

    assert p == 'aesthetics'
def test_arrow():
    p = (ggplot(df, aes('x', 'y', xend='xend', yend='yend'))
         + geom_segment(aes('x+2', xend='xend+2'), arrow=arrow(), size=2)
         + geom_segment(aes('x+4', xend='xend+4'),
                        arrow=arrow(ends='first'), size=2)
         + geom_segment(aes('x+6', xend='xend+6'),
                        arrow=arrow(ends='both'), size=2)
         )

    assert p == 'arrow'
def test_aesthetics():
    p = (ggplot(df, aes('x', 'y', xend='xend', yend='yend'))
         + geom_segment(size=2)
         # Positive slope segments
         + geom_segment(aes(yend='yend+1', color='factor(z)'), size=2)
         + geom_segment(aes(yend='yend+2', linetype='factor(z)'), size=2)
         + geom_segment(aes(yend='yend+3', size='z'), show_legend=False)
         + geom_segment(aes(yend='yend+4', alpha='z'), size=2,
                        show_legend=False))

    assert p + _theme == 'aesthetics'
def test_aesthetics():
    p = (ggplot(df, aes('x', 'y'))
         + geom_path(size=4)
         + geom_path(aes(y='y+2', alpha='x'), size=4, show_legend=False)
         + geom_path(aes(y='y+4'), size=4, linetype='dashed',
                     show_legend=False)
         + geom_path(aes(y='y+6', size='x'), color='red', show_legend=False)
         + geom_path(aes(y='y+8', color='x'), size=4))

    assert p == 'aesthetics'
def test_tile_aesthetics():
    p = (ggplot(df, aes('x', 'y', width=1, height=1))
         + geom_tile()
         + geom_tile(aes(y='y+2', alpha='z'), show_legend=False)
         + geom_tile(aes(y='y+4', fill='factor(z)'))
         + geom_tile(aes(y='y+6', color='factor(z+1)'), size=2)
         + geom_tile(aes(y='y+8', linetype='factor(z+2)'),
                     color='yellow', size=2)
         + _theme)

    assert p == 'tile-aesthetics'
def test_rect_nofill():
    p = (ggplot(df)
         + aes(xmin='xmin', xmax='xmax', ymin='ymin', ymax='ymax')
         + geom_rect(color='red', fill=None, size=2)
         + geom_rect(aes(ymin='ymin+2', ymax='ymax+2'),
                     color='blue', fill='None', size=2)
         + geom_rect(aes(ymin='ymin+4', ymax='ymax+4'),
                     color='green', fill='', size=2)
         + geom_rect(aes(ymin='ymin+6', ymax='ymax+6'),
                     color='yellow', fill='gray', size=2))

    assert p == 'rect-nofill'
def test_no_fill():
    df = pd.DataFrame({'x': range(5), 'y': range(5)})

    p = (ggplot(df, aes('x', 'y'))
         + geom_point(color='red', fill=None, size=5, stroke=1.5)
         + geom_point(aes(y='y+1'), color='blue', fill='none',
                      size=5, stroke=1.5)
         + geom_point(aes(y='y+2'), color='green', fill='',
                      size=5, stroke=1.5)
         + geom_point(aes(y='y+3'), color='yellow', fill='gray',
                      size=5, stroke=1.5))

    assert p == 'no_fill'
def test_stack_negative():
    df = df1.copy()
    _loc = df.columns.get_loc
    df.iloc[0, _loc('y')] *= -1
    df.iloc[len(df)-1, _loc('y')] *= -1

    p = (ggplot(df)
         + geom_col(aes('factor(x)', 'y', fill='factor(y)'),
                    position='stack')
         + geom_text(aes('factor(x)', 'y', label='y'),
                     position=position_stack(vjust=0.5))
         )

    assert p + _theme == 'stack-negative'
def test_line():
    df2 = df.copy()
    # geom_path plots in the given order; geom_line and
    # geom_step sort by x before plotting
    df2['x'] = df['x'].values[::-1]

    p = (ggplot(df2, aes('x'))
         + geom_path(aes(y='y'), size=4)
         + geom_line(aes(y='y+2'), color='blue', size=4)
         + geom_step(aes(y='y+4'), color='red', size=4))

    assert p == 'path_line_step'
def plot_char_percent_vs_accuracy_smooth(self, category=False):
    if category:
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='category_jmlr')
            + geom_smooth()
        )
    else:
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct')
            + geom_smooth(method='mavg')
        )
def plot_char_percent_vs_accuracy_histogram(self, category=False):
    if category:
        return (
            ggplot(self.char_plot_df)
            + facet_wrap('category_jmlr')
            + aes(x='char_percent', fill='Outcome')
            + geom_histogram(binwidth=.05)
        )
    else:
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', fill='Outcome')
            + geom_histogram(binwidth=.05)
        )
def plot_compare_accuracy(self, expo=False):
    if expo:
        return (
            ggplot(self.acc_df)
            + facet_wrap('position')
            + aes(x='guesser', y='accuracy', fill='Dataset')
            + geom_bar(stat='identity', position='dodge')
            + xlab('Guessing Model')
            + ylab('Accuracy')
        )
    else:
        return (
            ggplot(self.acc_df)
            + facet_wrap('position')
            + aes(x='guesser', y='accuracy')
            + geom_bar(stat='identity')
        )
def test_limits():
    p = (ggplot(df, aes('x'))
         + stat_function(fun=np.cos, size=2, color='blue',
                         arrow=arrow(ends='first'))
         + stat_function(fun=np.cos, xlim=(10, 20), size=2,
                         color='red', arrow=arrow(ends='last')))

    assert p == 'limits'
def test_continuous_x():
    n = len(df_continuous_x)
    p = (ggplot(df_continuous_x, aes('x', 'y'))
         + geom_point()
         + geom_smooth(df_continuous_x[3:n-3], method='loess',
                       color='blue', fullrange=False))

    assert p == 'continuous_x'
def test_stat_parameter_sharing():
    # When the stat has a parameter with the same name as
    # the geom aesthetic, they both get their value.
    # NOTE: This test may need to be modified when the
    # geom & stat internals change.

    class stat_abc(stat):
        DEFAULT_PARAMS = {'geom': 'point', 'position': 'identity',
                          'weight': 1}
        REQUIRED_AES = {'x'}
        CREATES = {'y'}

        @classmethod
        def compute_panel(cls, data, scales, **params):
            return data

    class geom_abc(geom):
        DEFAULT_PARAMS = {'stat': stat_abc, 'position': 'identity'}
        REQUIRED_AES = {'x', 'weight'}

        @staticmethod
        def draw(pinfo, panel_params, coord, ax, **kwargs):
            pass

    # weight is manually set; it should be a stat parameter and
    # not a geom manual setting
    g = geom_abc(weight=4)
    assert 'weight' in g.aes_params
    assert 'weight' in g._stat.params

    g = geom_abc(aes(weight='mpg'))
    assert 'weight' in g.mapping
    assert 'weight' in g._stat.params
def test_expand_limits():
    df = pd.DataFrame({'x': range(5, 11), 'y': range(5, 11)})
    p = (ggplot(aes('x', 'y'), data=df)
         + geom_point()
         + expand_limits(y=(0, None))
         )

    assert p == 'expand_limits'
def test_bool_mapping():
    df = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [True, False, False]
    })
    p = ggplot(df, aes('x', 'y')) + geom_point()

    assert p == 'bool_mapping'
def test_normal_with_line():
    p = (ggplot(df_normal, aes(sample='x'))
         + geom_qq()
         + geom_qq_line()
         )

    # Roughly a straight line of points through the origin
    assert p == 'normal_with_line'
def test_aesthetics():
    p = (ggplot(df)
         + geom_rug(aes('x', 'y'), size=2)
         + geom_rug(aes('x+2*n', 'y+2*n', alpha='z'), size=2, sides='tr')
         + geom_rug(aes('x+4*n', 'y+4*n', linetype='factor(z)'),
                    size=2, sides='t')
         + geom_rug(aes('x+6*n', 'y+6*n', color='factor(z)'),
                    size=2, sides='b')
         + geom_rug(aes('x+8*n', 'y+8*n', size='z'), sides='tblr'))

    if six.PY2:
        # Small displacement in y-axis text
        assert p + _theme == ('aesthetics', {'tol': 4})
    else:
        assert p + _theme == 'aesthetics'
def test_summary_functions():
    p = (ggplot(df, aes('x', 'y'))
         + stat_summary(fun_y=np.mean,
                        fun_ymin=np.min,
                        fun_ymax=np.max,
                        size=2))

    assert p == 'summary_functions'
def test_hull():
    p = (ggplot(mtcars)
         + aes('wt', 'mpg', color='factor(cyl)')
         + geom_point()
         + stat_hull(size=1)
         )

    assert p + _theme == 'hull'
def test_ribbon_facetting():
    p = (ggplot(df, aes('x', ymin='ymin', ymax='ymax',
                        fill='factor(z)'))
         + geom_ribbon()
         + facet_wrap('~ z')
         )

    assert p + _theme == 'ribbon_facetting'
def test_discrete_x():
    p = (ggplot(df, aes('xd', 'y'))
         + stat_summary_bin(fun_y=np.mean, fun_ymin=np.min,
                            fun_ymax=np.max, geom='bar'))

    assert p == 'discrete_x'
def analyze_thermal_values(thermal_array, mask, histplot=False):
    """Extract the thermal values of each pixel and write the values out
    to a file. It can also print out a histogram plot of pixel intensity
    and a pseudocolor image of the plant.

    Inputs:
    thermal_array = numpy array of thermal values
    mask          = binary mask made from selected contours
    histplot      = if True, plot a histogram of intensity values

    Returns:
    analysis_img = output image

    :param thermal_array: numpy.ndarray
    :param mask: numpy.ndarray
    :param histplot: bool
    :return analysis_img: ggplot
    """
    max_value = np.amax(thermal_array)

    # Calculate histogram
    hist_thermal = [
        float(i[0]) for i in
        cv2.calcHist([np.float32(thermal_array)], [0], mask,
                     [256], [0, max_value])
    ]
    bin_width = max_value / 256.
    b = 0
    bin_labels = [float(b)]
    for i in range(255):
        b += bin_width
        bin_labels.append(b)

    # Store debug mode
    debug = params.debug
    params.debug = None

    # Apply plant-shaped mask to image
    mask1 = binary_threshold(mask, 0, 255, 'light')
    params.debug = debug
    mask1 = (mask1 / 255)
    masked_thermal = thermal_array[np.where(mask > 0)]

    pixels = cv2.countNonZero(mask1)
    hist_percent = [(p / float(pixels)) * 100 for p in hist_thermal]

    maxtemp = np.amax(masked_thermal)
    mintemp = np.amin(masked_thermal)
    avgtemp = np.average(masked_thermal)
    mediantemp = np.median(masked_thermal)

    # Store data into outputs class
    outputs.add_observation(variable='max_temp', trait='maximum temperature',
                            method='plantcv.plantcv.analyze_thermal_values',
                            scale='degrees', datatype=float,
                            value=maxtemp, label='degrees')
    outputs.add_observation(variable='min_temp', trait='minimum temperature',
                            method='plantcv.plantcv.analyze_thermal_values',
                            scale='degrees', datatype=float,
                            value=mintemp, label='degrees')
    outputs.add_observation(variable='mean_temp', trait='mean temperature',
                            method='plantcv.plantcv.analyze_thermal_values',
                            scale='degrees', datatype=float,
                            value=avgtemp, label='degrees')
    outputs.add_observation(variable='median_temp', trait='median temperature',
                            method='plantcv.plantcv.analyze_thermal_values',
                            scale='degrees', datatype=float,
                            value=mediantemp, label='degrees')
    outputs.add_observation(variable='thermal_frequencies',
                            trait='thermal frequencies',
                            method='plantcv.plantcv.analyze_thermal_values',
                            scale='frequency', datatype=list,
                            value=hist_percent, label=bin_labels)

    analysis_img = None
    if histplot is True:
        params.device += 1
        dataset = pd.DataFrame({
            'Temperature C': bin_labels,
            'Proportion of pixels (%)': hist_percent
        })
        fig_hist = (ggplot(data=dataset,
                           mapping=aes(x='Temperature C',
                                       y='Proportion of pixels (%)'))
                    + geom_line(color='green'))
        analysis_img = fig_hist

        if params.debug == "print":
            fig_hist.save(os.path.join(
                params.debug_outdir,
                str(params.device) + '_therm_histogram.png'),
                verbose=False)
        elif params.debug == "plot":
            print(fig_hist)

    return analysis_img
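# A minimal calling sketch for the function above, using synthetic data
# (array sizes and values are illustrative assumptions, not from the
# original module); it assumes the PlantCV dependencies referenced above
# (params, outputs, binary_threshold) are importable.
import numpy as np

thermal = np.random.uniform(20.0, 35.0, size=(64, 64)).astype(np.float32)
mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 255  # pretend the plant occupies the centre

hist_fig = analyze_thermal_values(thermal, mask, histplot=True)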
def test_no_missing_values():
    p = (ggplot(df_missing, aes(x='x'))
         + geom_line(aes(y='y2'), size=2))

    assert p == 'no_missing_values'
def test_annotation_stripes_coord_flip():
    p = (ggplot(df)
         + annotation_stripes(fill_range='no')
         + geom_point(aes('factor(x)', 'y'))
         + geom_vline(xintercept=[0.5, 1.5, 2.5, 3.5])
         + coord_flip())

    assert p == 'annotation_stripes_coord_flip'
def test_annotation_stripes_single_stripe():
    p = (ggplot(df.assign(x=10))
         + annotation_stripes(fill=["#FF0000", "#00FF00"])
         + geom_point(aes('factor(x)', 'y')))

    assert p == 'annotation_stripes_single_stripe'
def test_linear_smooth_no_ci():
    p = (ggplot(df_linear, aes('x'))
         + geom_point(aes(y='y_noisy'))
         + geom_smooth(aes(y='y_noisy'), method='lm',
                       span=.3, color='blue', se=False))

    assert p == 'linear_smooth_no_ci'
from plotnine.data import mpg
from plotnine import ggplot, aes, facet_grid, labs, geom_point, stat_smooth

print(
    ggplot(mpg)
    + facet_grid(facets="year~class")
    + aes(x="displ", y="hwy")
    + labs(
        x="Engine Size",
        y="Miles per Gallon",
        title="Miles per Gallon for Each Year and Vehicle Class",
    )
    + geom_point()
    + stat_smooth(method='lm')
)
def test_lm_weights(self):
    p = (self.p
         + aes(weight='x.abs()')
         + stat_smooth(method='lm', formula='y ~ np.sin(x)',
                       fill='red', se=True))

    assert p == 'lm_formula_weights'
def test_sorts_by_x():
    df = pd.DataFrame({'x': [5, 0, 1, 2, 3, 4],
                       'y': range(6)})
    p = ggplot(df, aes('x', 'y')) + geom_smooth(stat='identity')

    assert p == 'sorts_by_x'
def test_mavg(self):
    p = self.p + geom_smooth(aes(y='y_noisy'), method='mavg',
                             method_args={'window': 10})
    p.draw_test()
def test_lowess(self):
    p = self.p + geom_smooth(aes(y='y_noisy'), method='lowess')
    with pytest.warns(PlotnineWarning):
        p.draw_test()
SDRsuper.dropna(inplace=True)
SDRsub.dropna(inplace=True)

# Add level column
SDRsuper.insert(0, 'Level', 'super')
SDRsub.insert(0, 'Level', 'sub')
SDRall = pd.concat([SDRsub, SDRsuper])

#%% SDRsuper and SDRsub violin plot + boxplot + lines
# =============================================================================
# Simple violin plot:
# =============================================================================
(ggplot(SDRall)
 + aes(y='value', x='Level', fill='Level')
 + geom_violin(scale="width"))

# =============================================================================
# Next level violin plots
# =============================================================================
shift = 0.1


def alt_sign(x):
    "Alternate +1/-1 if x is even/odd"
    return (-1)**x


m1 = aes(x=stage('Level', after_scale='x+shift*alt_sign(x)'))  # shift outward
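# Hedged continuation sketch (not from the original script): a mapping m2
# that mirrors m1 by shifting inward, so a narrow boxplot can sit beside
# each violin. The layer choices and widths below are illustrative
# assumptions about where the script was headed.
m2 = aes(x=stage('Level', after_scale='x-shift*alt_sign(x)'))  # shift inward

(ggplot(SDRall)
 + aes(y='value', x='Level', fill='Level')
 + geom_violin(m1, scale="width", show_legend=False)   # violins nudged outward
 + geom_boxplot(m2, width=shift, show_legend=False))   # boxes nudged inward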
def test_non_linear_smooth():
    p = (ggplot(df_linear, aes('x'))
         + geom_point(aes(y='y_noisy'))
         + geom_smooth(aes(y='y_noisy'), method='loess',
                       span=.3, color='blue'))

    assert p == 'non_linear_smooth'
from plotnine.data import economics
from plotnine import ggplot, aes, geom_line, labs

g = (ggplot(economics)
     + aes(x="date", y="uempmed")
     + geom_line()
     + labs(x="date", y="median duration of unemployment"))
print(g)
def test_discrete_x():
    p = (ggplot(df_discrete_x, aes('x', 'y'))
         + geom_point()
         + geom_smooth(color='blue'))

    assert p == 'discrete_x'
def test_annotation_stripes_fill_range_cycle():
    p = (ggplot(df)
         + annotation_stripes(fill_range='cycle')
         + geom_point(aes('factor(x)', 'y'))
         + geom_vline(xintercept=[0.5, 1.5, 2.5, 3.5]))

    assert p == 'annotation_stripes_fill_range_cycle'
#### Getting set up ####

%matplotlib inline

import plotnine as p9
import pandas as pd

# read in filtered datasets
birth_reduced = pd.read_csv("data/birth_reduced.csv")
smoke_complete = pd.read_csv("data/smoke_complete.csv")

#### create a simple ggplot ####
# bind data to new plot
# specify aesthetic: mapping data to plot
# layers: ways (shapes) through which data are represented
(p9.ggplot(data=smoke_complete,
           mapping=p9.aes(x="age_at_diagnosis", y="cigarettes_per_day"))
 + p9.geom_point()
 )

# ignore warnings (FutureWarning not fatal)
import warnings
warnings.simplefilter("ignore")
# add new cell at top of notebook and re-execute plot to remove errors

# Create object to hold plot framework
smoke_plot = p9.ggplot(data=smoke_complete,
                       mapping=p9.aes(x="age_at_diagnosis",
                                      y="cigarettes_per_day"))

# Draw the plot
smoke_plot + p9.geom_point()
def test_annotation_stripes_continuous_scale():
    p = (ggplot(df)
         + annotation_stripes()
         + geom_point(aes('x', 'y'))
         + geom_vline(xintercept=[0.5, 1.5, 2.5, 3.5]))

    assert p == 'annotation_stripes_continuous_scale'
from plotnine import (
    aes,
    geom_bar,
    geom_text,
    geom_line,
    ggplot,
    ggtitle,
    theme,
    theme_classic,
    xlab,
    ylab,
)

plt = (
    ggplot(
        data=pr_curves_df,
        mapping=aes(x="recall", y="precision"),
        # "factor(species, ordered=False)",
    )
    + geom_line()
)

#%%
plt2 = (
    ggplot(
        data=pr_curves_df,
        mapping=aes(
            x="false_positive_rate",
            y="true_positives_ratio",
        ),
        # "factor(species, ordered=False)",
    )
    + geom_line()
)
def test_discrete_x_fullrange():
    p = (ggplot(df_discrete_x, aes('x', 'y'))
         + geom_point()
         + geom_smooth(color='blue', fullrange=True))

    assert p == 'discrete_x_fullrange'
from plotnine.data import mpg
from plotnine import ggplot, aes, geom_bar

print(ggplot(mpg) + aes(x="class") + geom_bar())
# res = aci.groupby(["plot"], as_index=False).apply(check_dates, site_data)
# res = res.groupby(["site", "julian"], as_index=False).agg(
#     {"ACI": ["mean", "std"], "lat": "mean", "lon": "mean"})
# res.columns = pd.Index(join_tuple(i, "_") for i in res.columns)
# print(aci.loc[aci["site"] == "Igloolik"])
# print(aci)
res
# res.to_feather("data_glm.feather")


def label_x(dates):
    res = [(datetime.datetime(2018, 1, 1)
            + datetime.timedelta(x)).strftime("%d-%m") for x in dates]
    print(res)
    return res


(ggplot(data=res, mapping=aes(x='julian', y='value', colour='type'))
 + xlab("Day")
 + ylab("Mean number of detected songs")
 + facet_grid("type~", scales="free")
 # + geom_line()
 # + facet_wrap("type", nrow=2, ncol=1)
 + geom_point()
 # + geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std"))
 + geom_smooth(method="mavg", se=False,
               method_args={"window": 4, "center": True, "min_periods": 1})
 + scale_colour_manual(values=cbbPalette, guide=False)
 + scale_x_continuous(labels=label_x)
 ).save("figs/song_events_aci_BARROW_mean_smoothed2.png",
        height=10, width=16, dpi=150)

(ggplot(data=res, mapping=aes(x='julian', y='n_events_sum', colour='site'))
 + xlab("Day")
 + ylab("Total number of detected songs")
 # + facet_grid("site~", scales="free")
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
        Read AnnData object and PCs file. Clusters the data. Saves an
        AnnData object with clusters in the clusters slot, a clusters
        file, and QC plots.
        """)

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-h5', '--h5_anndata',
        action='store', dest='h5', required=True,
        help='H5 AnnData file.')

    parser.add_argument(
        '-pc', '--tsv_pcs',
        action='store', dest='pc', default='',
        help='Tab-delimited file of PCs for each cell. First column is\
            cell_barcode. Subsequent columns are PCs. If "", uses pca\
            slot in AnnData.\
            (default: "")')

    parser.add_argument(
        '-cm', '--cluster_method',
        action='store', dest='cm', default='leiden',
        help='Clustering method. Valid options: [leiden|louvain].\
            (default: %(default)s)')

    parser.add_argument(
        '-npc', '--number_pcs',
        action='store', dest='npc', default=0, type=int,
        help='Number of PCs to use.\
            (default: maximum number in tsv_pcs file)')

    parser.add_argument(
        '-r', '--resolution',
        action='store', dest='r', default=1.0, type=float,
        help='Resolution.\
            (default: %(default)s)')

    parser.add_argument(
        '-nn', '--number_neighbors',
        action='store', dest='number_neighbors', default=25, type=int,
        help='Number of neighbors. If <= 0, sets to the number of unique\
            "experiment_id".\
            (default: %(default)s)')

    parser.add_argument(
        '--force_recalculate_neighbors',
        action='store_true', dest='calculate_neighbors', default=False,
        help='Calculate neighbor graph even if it already exists in the\
            AnnData (which it may do if you already ran BBKNN).\
            (default: %(default)s)')

    parser.add_argument(
        '-ncpu', '--number_cpu',
        action='store', dest='ncpu', default=4, type=int,
        help='Number of CPUs to use.\
            (default: %(default)s)')

    parser.add_argument(
        '-of', '--output_file',
        action='store', dest='of', default='',
        help='Basename of output files, assuming output in current working\
            directory.\
            (default: <h5_anndata>-<tsv_pcs>-clustered)')

    options = parser.parse_args()

    # Fixed settings.
    verbose = True

    # Scanpy settings
    sc.settings.figdir = os.getcwd()  # figure output directory to match base
    sc.settings.n_jobs = options.ncpu  # number of CPUs
    # sc.settings.max_memory = 500  # in Gb
    # sc.set_figure_params(dpi_save=300)

    # Get the output file base.
    out_file_base = options.of
    if out_file_base == '':
        out_file_base = '{}-{}-clustered'.format(
            os.path.basename(options.h5.rstrip('h5ad').rstrip('.')),
            os.path.basename(options.pc.rstrip('tsv.gz').rstrip('.')))

    # Load the AnnData file.
    adata = sc.read_h5ad(filename=options.h5)

    # Load the PCs.
    if options.pc == '':
        df_pca = pd.DataFrame(
            data=adata.obsm['X_pca'],
            index=adata.obs.index,
            columns=[
                'PC{}'.format(x)
                for x in range(1, adata.obsm['X_pca'].shape[1] + 1)
            ])
    else:
        df_pca = pd.read_csv(options.pc, sep='\t', index_col='cell_barcode')

    # Check that n_pcs is valid.
    n_pcs = options.npc
    if n_pcs == 0:
        n_pcs = len(df_pca.columns)
    elif n_pcs > len(df_pca.columns):
        raise Exception(
            '--number_pcs ({}) is > than n_pcs in --tsv_pcs ({}).'.format(
                n_pcs, len(df_pca.columns)))
    if verbose:
        print('Using {} PCs.'.format(n_pcs))

    # Add the reduced dimensions to the AnnData object.
    adata.obsm['X_pca'] = df_pca.loc[adata.obs.index, :].values.copy()

    # Calculate neighbors on the specified PCs.
    # By default saved to adata.uns['neighbors'].
    #
    # First, however, check whether adata.uns['neighbors'] already exists,
    # and unless the user tells us not to, use that slot rather than
    # recalculating neighbors. This default behaviour accommodates the
    # case where BBKNN has already been run on the data.
    if 'neighbors' not in adata.uns or options.calculate_neighbors:
        number_neighbors = options.number_neighbors
        if number_neighbors <= 0:
            number_neighbors = len(adata.obs['experiment_id'].cat.categories)
        sc.pp.neighbors(
            adata,
            use_rep='X_pca',
            n_neighbors=number_neighbors,
            n_pcs=n_pcs,
            copy=False,
            random_state=0)
    else:
        warnings.warn('WARNING: found neighbors slot in adata.uns. {}'.format(
            'Not calculating neighbors (ignoring n_neighbors parameter).'))
        # If we are using the pre-calculated neighbors, take note of n_pcs.
        if 'n_pcs' in adata.uns['neighbors']['params']:
            n_pcs = adata.uns['neighbors']['params']['n_pcs']

    # Run the clustering, choosing either the leiden or louvain algorithm.
    cluster_method = options.cm
    cluster_resolution = options.r
    if cluster_method == 'leiden':
        sc.tl.leiden(
            adata,
            resolution=cluster_resolution,
            key_added=cluster_method,
            copy=False,
            random_state=0)
    elif cluster_method == 'louvain':
        sc.tl.louvain(
            adata,
            flavor='vtraag',
            resolution=cluster_resolution,
            key_added=cluster_method,
            copy=False,
            random_state=0)
    else:
        raise Exception('Invalid --cluster_method: {}.'.format(
            cluster_method))

    # Also save the clusters to the same spot so we know where they will be.
    adata.uns['cluster'] = adata.uns[cluster_method]
    adata.uns['cluster']['params']['method'] = cluster_method
    adata.obs['cluster'] = adata.obs[cluster_method]

    # Print the final number of clusters discovered.
    if verbose:
        print('{} clusters identified'.format(
            adata.obs[cluster_method].drop_duplicates().shape[0]))

    # Save the clustered data to a data frame.
    cell_clustering_df = adata.obs[[cluster_method]].copy()
    cell_clustering_df.columns = ['cluster']
    cell_clustering_df['cluster_method'] = cluster_method
    cell_clustering_df['cluster_resolution'] = cluster_resolution
    cell_clustering_df.to_csv(
        '{}.tsv.gz'.format(out_file_base),
        sep='\t',
        index=True,
        quoting=csv.QUOTE_NONNUMERIC,
        index_label='cell_barcode',
        na_rep='',
        compression='gzip')

    adata.write('{}.h5ad'.format(out_file_base), compression='gzip')

    # Save a dotplot of the number of cells for each sample in each cluster.
    df = adata.obs[['experiment_id', 'cluster']]
    df = df.groupby(['cluster', 'experiment_id']).size().reset_index(
        name='nr_cells')
    gplt = plt9.ggplot(df, plt9.aes(x='experiment_id', y='cluster'))
    gplt = gplt + plt9.geom_point(plt9.aes(size='nr_cells', color='nr_cells'))
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=90))
    gplt.save(
        'dotplot_sample-{}.png'.format(out_file_base),
        dpi=300,
        width=4,
        height=4)
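# A hypothetical invocation of the CLI defined above (the script and file
# names are illustrative assumptions; the flags are the ones registered on
# the parser):
#
#   python cluster.py \
#       --h5_anndata adata.h5ad \
#       --tsv_pcs pcs.tsv.gz \
#       --cluster_method leiden \
#       --resolution 1.0 \
#       --number_neighbors 25

# Standard entry-point guard, assuming the script is run directly.
if __name__ == '__main__':
    main()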
def test_missing_values():
    p = (ggplot(df_missing, aes(x='x'))
         + geom_line(aes(y='y1'), size=2))

    with pytest.warns(UserWarning):
        assert p == 'missing_values'
combi_miner = Modl(
    X,
    multiple_overlap_max_number_of_combinations=DEFAULT_QUERIED_ATOMS,
    nb_threads=N_THREADS)
modl_interesting_combis = combi_miner.find_interesting_combinations()
stop_time = time.time()

df_bench = df_bench.append(
    {
        'nb_sets': size,
        'time': stop_time - start_time
    },
    ignore_index=True)

df_bench['nb_sets'] = df_bench['nb_sets'].astype(int)

p = (ggplot(df_bench)
     + aes('nb_sets', 'time')
     + geom_point()
     + geom_smooth(span=.3)
     + scale_x_continuous()
     + xlab("Number of sets")
     + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig1")

## Number of queried words
df_bench = pd.DataFrame(columns=['step', 'time'])

X = test_data_for_modl(nflags=NFLAGS, number_of_sets=DEFAULT_N_SETS,
                       noise=NOISE)

for _ in REPEATS:
    for step in STEPS:
        start_time = time.time()
def test_aes_inheritance():
    with pytest.raises(PlotnineError):
        p = (ggplot(df, aes('x', 'y', yintercept='yintercept'))
             + geom_point()
             + geom_hline(size=2))
        p.draw_test()
def test_aes_overwrite():
    with pytest.warns(PlotnineWarning):
        geom_hline(aes(color='y'), yintercept=2)
def test_gls(self):
    p = self.p + geom_smooth(aes(y='y_noisy'), method='gls')
    p.draw_test()
def plot_portfolio(portfolio_df, figure_size=(12, 4),
                   line_size=1.5, date_text_size=7):
    """
    Given a daily snapshot of virtual purchases, plot both overall and
    per-stock performance. Return a tuple of figures representing the
    performance as inline data.
    """
    assert portfolio_df is not None
    # print(portfolio_df)
    portfolio_df["date"] = pd.to_datetime(portfolio_df["date"])
    avg_profit_over_period = (
        portfolio_df.filter(items=["stock", "stock_profit"])
        .groupby("stock")
        .mean()
    )
    avg_profit_over_period["contribution"] = [
        "positive" if profit >= 0.0 else "negative"
        for profit in avg_profit_over_period.stock_profit
    ]
    # don't want to override actual profit with the average
    avg_profit_over_period = avg_profit_over_period.drop(
        "stock_profit", axis="columns")
    portfolio_df = portfolio_df.merge(
        avg_profit_over_period, left_on="stock", right_index=True, how="inner"
    )
    # print(portfolio_df)

    # 1. overall performance
    df = portfolio_df.filter(
        items=["portfolio_cost", "portfolio_worth",
               "portfolio_profit", "date"]
    )
    df = df.melt(id_vars=["date"], var_name="field")
    plot = (
        p9.ggplot(df, p9.aes("date", "value", group="field", color="field"))
        + p9.labs(x="", y="$ AUD")
        + p9.geom_line(size=1.5)
        + p9.facet_wrap("~ field", nrow=3, ncol=1, scales="free_y")
        + p9.theme(
            axis_text_x=p9.element_text(angle=30, size=date_text_size),
            figure_size=figure_size,
            legend_position="none",
        )
    )
    overall_figure = plot_as_inline_html_data(plot)

    df = portfolio_df.filter(
        items=["stock", "date", "stock_profit", "stock_worth", "contribution"]
    )
    melted_df = df.melt(id_vars=["date", "stock", "contribution"],
                        var_name="field")
    all_dates = sorted(melted_df["date"].unique())
    df = melted_df[melted_df["date"] == all_dates[-1]]
    df = df[df["field"] == "stock_profit"]  # only latest profit is plotted
    df["contribution"] = [
        "positive" if profit >= 0.0 else "negative" for profit in df["value"]
    ]

    # 2. plot contributors ie. winners and losers
    plot = (
        p9.ggplot(df, p9.aes("stock", "value", fill="stock"))
        + p9.geom_bar(stat="identity")
        + p9.labs(x="", y="$ AUD")
        + p9.facet_grid("contribution ~ field", scales="free_y")
        + p9.theme(legend_position="none", figure_size=figure_size)
    )
    profit_contributors = plot_as_inline_html_data(plot)

    # 3. per purchased stock performance
    plot = (
        p9.ggplot(melted_df, p9.aes("date", "value", group="stock",
                                    colour="stock"))
        + p9.xlab("")
        + p9.geom_line(size=1.0)
        + p9.facet_grid("field ~ contribution", scales="free_y")
        + p9.theme(
            axis_text_x=p9.element_text(angle=30, size=date_text_size),
            figure_size=figure_size,
            panel_spacing=0.5,  # more space between plots to avoid tick mark overlap
            subplots_adjust={"right": 0.8},
        )
    )
    stock_figure = plot_as_inline_html_data(plot)

    return overall_figure, stock_figure, profit_contributors
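# A minimal calling sketch for plot_portfolio with synthetic data (two
# stocks over three days; all values are illustrative assumptions). It
# assumes plot_as_inline_html_data is importable from the same module,
# since the function depends on it.
import pandas as pd

snapshot = pd.DataFrame({
    "date": ["2021-01-01", "2021-01-02", "2021-01-03"] * 2,
    "stock": ["ABC"] * 3 + ["XYZ"] * 3,
    "stock_profit": [1.0, 2.0, 3.0, -1.0, -2.0, -0.5],
    "stock_worth": [10.0, 11.0, 12.0, 9.0, 8.0, 9.5],
    "portfolio_cost": [20.0] * 6,
    "portfolio_worth": [19.0, 19.5, 21.5, 19.0, 19.5, 21.5],
    "portfolio_profit": [-1.0, -0.5, 1.5, -1.0, -0.5, 1.5],
})

overall, per_stock, contributors = plot_portfolio(snapshot)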