def process_1d_histogram(self, name, hist):
    """Summarise a 1d histogram: statistics, plot and report page.

    Builds an ``ArrayStats`` object from the histogram's bin labels (or bin
    centers for numeric data) weighted by the bin entries, plots the
    resulting histogram to a pdf file in ``self.results_path`` and appends
    a filled-in copy of ``self.page_template`` to ``self.pages``.

    :param str name: name of the histogram
    :param hist: input histogram object
    """
    # column properties derived from the histogram's datatype
    props = statistics.get_col_props(hist.datatype)
    is_num = props['is_num']
    is_ts = props['is_ts']

    # a histogram without bins has nothing to summarise
    if hist.n_bins == 0:
        self.logger.warning('Histogram "{name}" is empty; skipping.', name=name)
        return

    # numeric histograms are described by bin centers and edges,
    # all other histograms by their bin labels only
    if is_num:
        labels = hist.bin_centers()
    else:
        labels = hist.bin_labels()
    counts = hist.bin_entries()
    edges = hist.bin_edges() if is_num else None

    if is_ts:
        # timestamps are converted element-wise to pandas Timestamp objects
        as_timestamp = np.vectorize(lambda value: pd.Timestamp(value))
        labels = as_timestamp(labels)
        edges = as_timestamp(edges)

    # statistics object for the histogram, weighted by the bin counts
    label = self.var_labels.get(name, name)
    unit = self.var_units.get(name, '')
    stats = statistics.ArrayStats(labels, name, weights=counts, unit=unit, label=label)
    stats.create_stats()

    # histogram used for plotting
    # NB (original author's note): bin_edges overrules var_bins if it is not None
    requested_bins = self.var_bins.get(name, NUMBER_OF_BINS)
    nphist = stats.make_histogram(var_bins=requested_bins, bin_edges=edges)

    # plot properties; spaces in the name are replaced in the file name
    x_label = stats.get_x_label()
    y_label = self.hist_y_label or None
    hist_file_name = 'hist_{}.pdf'.format(name.replace(' ', '_'))
    pdf_file_name = '{0:s}/{1:s}'.format(self.results_path, hist_file_name)

    # matplotlib plot of the histogram
    visualization.vis_utils.plot_histogram(nphist,
                                           x_label=x_label,
                                           y_label=y_label,
                                           is_num=is_num,
                                           is_ts=is_ts,
                                           pdf_file_name=pdf_file_name)

    # overview table of the histogram statistics
    table = stats.get_latex_table()

    # fill the page template and store the page
    page = (self.page_template
            .replace('VAR_LABEL', label)
            .replace('VAR_STATS_TABLE', table)
            .replace('VAR_HISTOGRAM_PATH', hist_file_name))
    self.pages.append(page)
def process_series(self, col, sample):
    """Create statistics of and plot input pandas series.

    The number of nulls in the series is always appended to
    ``self.nan_counts``.  A series consisting of nulls only (this includes
    an empty series) is skipped after that.  Otherwise an ``ArrayStats``
    object is created, its histogram is plotted to a pdf file in
    ``self.results_path`` and a filled-in copy of ``self.page_template``
    is appended to ``self.pages``.

    :param str col: name of the series
    :param sample: input pandas series object
    """
    # skip columns consisting entirely of nans
    nan_cnt = sample.isnull().sum()
    self.nan_counts.append(nan_cnt)
    if nan_cnt == len(sample.index):
        self.log().debug('Column "%s" consists of nans only; skipping', col)
        return

    # create statistics object for column and evaluate its properties
    var_label = self.var_labels.get(col, col)
    stats = statistics.ArrayStats(sample, col,
                                  unit=self.var_units.get(col, ''),
                                  label=var_label)
    stats.create_stats()

    # make histogram
    nphist = stats.make_histogram(var_bins=self.var_bins.get(col, NUMBER_OF_BINS))

    # determine histogram properties for plotting
    x_label = stats.get_x_label()
    y_label = self.hist_y_label if self.hist_y_label else None
    col_props = stats.get_col_props()
    is_num = col_props['is_num']
    is_ts = col_props['is_ts']

    # Replace spaces in the file name, exactly as process_1d_histogram does,
    # so both code paths produce file names following the same convention.
    hist_file_name = 'hist_{}.pdf'.format(col.replace(' ', '_'))
    pdf_file_name = '{0:s}/{1:s}'.format(self.results_path, hist_file_name)

    # plot histogram of column variable
    visualization.vis_utils.plot_histogram(nphist,
                                           x_label=x_label,
                                           y_label=y_label,
                                           is_num=is_num,
                                           is_ts=is_ts,
                                           pdf_file_name=pdf_file_name)

    # create overview table of column variable
    stats_table = stats.get_latex_table()

    # create page string for report
    page = (self.page_template
            .replace('VAR_LABEL', var_label)
            .replace('VAR_STATS_TABLE', stats_table)
            .replace('VAR_HISTOGRAM_PATH', hist_file_name))
    self.pages.append(page)
def update_table(value_x):
    """Return the statistics table rows for the selected variable.

    ``ds`` and ``self`` are not parameters; they are resolved from the
    surrounding scope, which is not part of this block.

    :param value_x: selected column name, ``None`` or ``0``
    :returns: list of ``{'Variable': ..., 'Value': ...}`` records; for
        ``None`` the statistics of the first column multiplied by zero;
        ``None`` when ``value_x`` equals 0
    """
    if value_x is None:
        # no selection: placeholder table based on the first column
        values = ds[self.read_key].iloc[:, 0].values
        label = 'fake'
    elif value_x != 0:
        values = ds[self.read_key].loc[:, value_x].values
        label = ds[self.label_key][value_x]
    else:
        # value 0: the original code fell through and returned None
        return None

    stats = statistics.ArrayStats(values, value_x, label=label)
    stats.create_stats()

    # One row per statistic; column 1 of the transposed frame is dropped.
    # ``axis`` is passed by keyword: pandas 2 no longer accepts it positionally.
    input_stats = pd.DataFrame.from_dict(stats.stat_vals, orient='columns').T.drop(1, axis=1)
    if value_x is None:
        input_stats *= 0
    input_stats.reset_index(inplace=True)
    input_stats.columns = ['Variable', 'Value']

    # 'records' is the documented orient; the short form 'rows' is rejected
    # by pandas 2.
    return input_stats.to_dict('records')
def execute(self):
    """Execute the link.

    Builds a Dash application summarising the dataframe stored under
    ``self.read_key``: a variable dropdown, a hue dropdown, a histogram
    with a bin-count slider and a table of statistics.  The application
    object is stored in the datastore under ``self.app_store_key``; it is
    not started here.

    Side effects on the datastore: the column names are stored under
    ``self.col_key``, the unpickled labels under ``self.label_key`` and
    ``self.hue_cols`` under ``self.hue_key``.  The zeroed statistics table
    of the first column is kept in ``self.initial_stats``.

    :returns: status code of execution
    :rtype: StatusCode
    """
    # The original code fetched the ConfigObject service without using the
    # result.  The lookup is kept in case it has a side effect -- TODO confirm.
    process_manager.service(ConfigObject)
    ds = process_manager.service(DataStore)

    df = ds[self.read_key]
    ds[self.col_key] = df.columns.values

    # Labels are unpickled from a path relative to this module.
    # NOTE: pickle must only be used on trusted files.
    base_path = os.path.join(os.path.dirname(__file__), '../../../data/' + self.label_key)
    with open(base_path, 'rb') as label_file:
        ds[self.label_key] = pickle.load(label_file)
    ds[self.hue_key] = self.hue_cols

    def stats_frame(values, name, label, zeroed=False):
        """Return a Variable/Value frame with the ArrayStats of ``values``."""
        stats = statistics.ArrayStats(values, name, label=label)
        stats.create_stats()
        # ``axis`` is passed by keyword: pandas 2 no longer accepts it positionally.
        frame = pd.DataFrame.from_dict(stats.stat_vals, orient='columns').T.drop(1, axis=1)
        if zeroed:
            frame *= 0
        frame.reset_index(inplace=True)
        frame.columns = ['Variable', 'Value']
        return frame

    def themed_layout(**extra):
        """Return the histogram layout in the link's colours."""
        return go.Layout(plot_bgcolor=self.plt_bgcolor,
                         paper_bgcolor=self.plt_papercolor,
                         font=dict(color=self.text_color),
                         title='Histogram',
                         **extra)

    # table shown before a variable is selected: first column, all values zeroed
    first_col = df.columns[0]
    self.initial_stats = stats_frame(df.iloc[:, 0].values, first_col,
                                     ds[self.label_key][first_col], zeroed=True)

    self.logger.debug('Now executing link: {link}.', link=self.name)
    self.logger.debug('Building Dash app from {path}.', path=os.path.abspath(__file__))

    app = dash.Dash(__name__,
                    assets_folder=os.path.join(os.path.dirname(__file__), '../../../macros/assets/'),)

    variable_column = html.Div([
        html.H5("Variable"),
        dcc.Dropdown(id='x_dropdown',
                     options=[{'label': ds[self.label_key][x], 'value': x} for x in ds[self.col_key]],
                     value=0,
                     placeholder='Select...',
                     style={'width': '100%'}),
        html.H5("Hue"),
        dcc.Dropdown(id='hue_dropdown',
                     options=[{'label': ds[self.label_key][x], 'value': x} for x in ds[self.hue_key]],
                     value=0,
                     placeholder='Select...',
                     style={'width': '100%'}),
    ], className='two columns')

    histogram_column = html.Div([
        html.Div([
            dcc.Graph(id='histogram', figure={'layout': themed_layout()}),
            dcc.Slider(id='bin_slider1',
                       min=1,
                       max=100,
                       step=1,
                       value=30,
                       updatemode='drag',
                       marks={x: {'label': str(x)} for x in range(0, 100, 10)}),
        ])], className='six columns')

    table_column = html.Div([
        html.Div([
            dash_table.DataTable(id='table',
                                 columns=[{"name": i, "id": i} for i in self.initial_stats],
                                 # 'records' is the documented orient; the short
                                 # form 'rows' is rejected by pandas 2
                                 data=self.initial_stats.to_dict('records'),
                                 style_as_list_view=True,
                                 style_header={'backgroundColor': self.plt_papercolor,
                                               'color': 'white'},
                                 style_cell={'backgroundColor': self.plt_bgcolor,
                                             'color': 'white'},
                                 )
        ], className='offset-by-one columns', style={'width': '20%'})])

    app.layout = html.Div([
        html.H1('DataFrames: a summary'),
        html.Div([variable_column, histogram_column, table_column], className='row'),
    ])

    @app.callback(dash.dependencies.Output('histogram', 'figure'),
                  [dash.dependencies.Input('x_dropdown', 'value'),
                   dash.dependencies.Input('hue_dropdown', 'value'),
                   dash.dependencies.Input('bin_slider1', 'value')])
    def update_plot(value_x, hue, bins):
        """Return the histogram figure for the selected variable and hue."""
        # a cleared dropdown (None) is treated like the initial value 0
        if hue is None:
            hue = 0
        if value_x is None:
            value_x = 0
        frame = ds[self.read_key]

        if value_x == 0:
            return {'data': [],
                    'layout': themed_layout(xaxis={'title': 'Please select a variable'})}

        if hue != 0:
            # seaborn is only needed for the per-category colour palette
            import seaborn as sns

            categories = frame[hue].unique()
            palette = sns.palettes.color_palette('YlGnBu', n_colors=len(categories)).as_hex()
            traces = [go.Histogram(x=frame.loc[frame[hue] == category, value_x],
                                   nbinsx=bins,
                                   name=str(category),
                                   marker=dict(color=palette[i]),
                                   text=frame[value_x])
                      for i, category in enumerate(categories)]
        else:
            traces = [go.Histogram(x=frame[value_x], nbinsx=bins, text=frame[value_x])]

        return {'data': traces,
                'layout': themed_layout(xaxis={'title': ds[self.label_key][value_x]})}

    @app.callback(dash.dependencies.Output('table', 'data'),
                  [dash.dependencies.Input('x_dropdown', 'value')])
    def update_table(value_x):
        """Return the statistics table rows for the selected variable."""
        if value_x is None:
            # cleared dropdown: zeroed placeholder based on the first column
            table = stats_frame(ds[self.read_key].iloc[:, 0].values, value_x, 'fake',
                                zeroed=True)
        elif value_x != 0:
            table = stats_frame(ds[self.read_key].loc[:, value_x].values, value_x,
                                ds[self.label_key][value_x])
        else:
            # value 0: the original code fell through and returned None
            return None
        return table.to_dict('records')

    ds[self.app_store_key] = app

    return StatusCode.Success