コード例 #1
0
    def process_1d_histogram(self, name, hist):
        """Summarise a one-dimensional histogram and add a report page for it.

        Statistics are evaluated on the bin labels weighted by the bin entries,
        a plot of the (re)binned histogram is written as a PDF below
        ``self.results_path``, and a filled-in copy of ``self.page_template``
        is appended to ``self.pages``.  A histogram without bins is skipped
        with a warning.

        :param str name: name of the histogram
        :param hist: input histogram object
        """
        # column properties follow from the histogram's data type
        props = statistics.get_col_props(hist.datatype)
        is_num, is_ts = props['is_num'], props['is_ts']

        # nothing to summarise when there are no bins
        if hist.n_bins == 0:
            self.logger.warning('Histogram "{name}" is empty; skipping.', name=name)
            return

        # numeric histograms are described by bin centers and edges,
        # all others by their bin labels only
        if is_num:
            bin_labels = hist.bin_centers()
        else:
            bin_labels = hist.bin_labels()
        bin_counts = hist.bin_entries()
        bin_edges = None
        if is_num:
            bin_edges = hist.bin_edges()

        if is_ts:
            # element-wise conversion of labels and edges to pandas timestamps
            def _as_timestamp(value):
                return pd.Timestamp(value)

            convert = np.vectorize(_as_timestamp)
            bin_labels = convert(bin_labels)
            bin_edges = convert(bin_edges)

        # statistics object for the histogram, weighted by the bin entries
        var_label = self.var_labels.get(name, name)
        var_unit = self.var_units.get(name, '')
        stats = statistics.ArrayStats(bin_labels, name, weights=bin_counts, unit=var_unit, label=var_label)
        stats.create_stats()

        # histogram used for plotting; per the original note, bin_edges
        # overrules var_bins when it is not None
        requested_bins = self.var_bins.get(name, NUMBER_OF_BINS)
        nphist = stats.make_histogram(var_bins=requested_bins, bin_edges=bin_edges)

        # plot properties; spaces in the name are replaced in the file name
        x_label = stats.get_x_label()
        y_label = self.hist_y_label or None
        hist_file_name = 'hist_{}.pdf'.format(name.replace(' ', '_'))
        pdf_file_name = '{0:s}/{1:s}'.format(self.results_path, hist_file_name)

        # matplotlib plot of the histogram
        visualization.vis_utils.plot_histogram(
            nphist,
            x_label=x_label,
            y_label=y_label,
            is_num=is_num,
            is_ts=is_ts,
            pdf_file_name=pdf_file_name,
        )

        # overview table of the histogram statistics
        stats_table = stats.get_latex_table()

        # fill the page template and store the resulting page
        page = self.page_template.replace('VAR_LABEL', var_label)
        page = page.replace('VAR_STATS_TABLE', stats_table)
        page = page.replace('VAR_HISTOGRAM_PATH', hist_file_name)
        self.pages.append(page)
コード例 #2
0
ファイル: df_summary.py プロジェクト: Patechoc/Eskapade
    def process_series(self, col, sample):
        """Summarise a pandas series and add a report page for it.

        The number of nulls is recorded in ``self.nan_counts``.  Unless the
        series consists of nulls only, statistics are evaluated, a histogram
        is plotted to a PDF below ``self.results_path``, and a filled-in copy
        of ``self.page_template`` is appended to ``self.pages``.

        :param str col: name of the series
        :param sample: input pandas series object
        """
        # always record the null count, then skip all-null columns
        n_missing = sample.isnull().sum()
        self.nan_counts.append(n_missing)
        if n_missing == len(sample.index):
            self.log().debug('Column "%s" consists of nans only; skipping', col)
            return

        # statistics object for the column
        var_label = self.var_labels.get(col, col)
        var_unit = self.var_units.get(col, '')
        stats = statistics.ArrayStats(sample, col, unit=var_unit, label=var_label)
        stats.create_stats()

        # histogram of the column values
        requested_bins = self.var_bins.get(col, NUMBER_OF_BINS)
        nphist = stats.make_histogram(var_bins=requested_bins)

        # plot properties
        x_label = stats.get_x_label()
        y_label = self.hist_y_label or None
        is_num = stats.get_col_props()['is_num']
        is_ts = stats.get_col_props()['is_ts']
        hist_file_name = 'hist_{}.pdf'.format(col)
        pdf_file_name = '{0:s}/{1:s}'.format(self.results_path, hist_file_name)

        # plot histogram of the column variable
        visualization.vis_utils.plot_histogram(nphist, x_label=x_label, y_label=y_label, is_num=is_num,
                                               is_ts=is_ts, pdf_file_name=pdf_file_name)

        # overview table of the column variable
        stats_table = stats.get_latex_table()

        # fill the page template, one placeholder at a time, and store the page
        page = self.page_template
        substitutions = (('VAR_LABEL', var_label),
                         ('VAR_STATS_TABLE', stats_table),
                         ('VAR_HISTOGRAM_PATH', hist_file_name))
        for placeholder, replacement in substitutions:
            page = page.replace(placeholder, replacement)
        self.pages.append(page)
コード例 #3
0
        def update_table(value_x):
            """Return the statistics table rows for the selected variable.

            Closure over ``ds`` and ``self`` from the enclosing scope.

            * ``value_x is None``: statistics of the first column of
              ``ds[self.read_key]`` are evaluated and multiplied by zero, which
              gives a table with the usual rows but zeroed values.
            * any other value except ``0``: statistics of column ``value_x``.
            * ``value_x == 0``: nothing is computed and ``None`` is returned
              (this was an implicit fall-through in the original code).

            :param value_x: selected column name, ``None`` or ``0``
            :returns: list of ``{'Variable': ..., 'Value': ...}`` records, or None
            """
            no_selection = value_x is None
            if no_selection:
                values = ds[self.read_key].iloc[:, 0].values
                label = 'fake'
            elif value_x != 0:
                values = ds[self.read_key].loc[:, value_x].values
                label = ds[self.label_key][value_x]
            else:
                return None

            stats = statistics.ArrayStats(values, value_x, label=label)
            stats.create_stats()

            # one row per statistic after transposing; column 1 is discarded.
            # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it positionally.
            input_stats = pd.DataFrame.from_dict(stats.stat_vals, orient='columns').T.drop(1, axis=1)
            if no_selection:
                input_stats *= 0
            input_stats.reset_index(inplace=True)
            input_stats.columns = ['Variable', 'Value']

            # 'records' is the full name of the orient that the former 'rows'
            # abbreviation resolved to; abbreviations were removed in pandas 2.0
            return input_stats.to_dict('records')
コード例 #4
0
    def execute(self):
        """Execute the link.

        Builds a Dash application with a variable dropdown, a hue dropdown,
        a histogram with a bin-count slider and a statistics table for the
        data frame stored under ``self.read_key``.  The application is not
        started here; it is stored in the datastore under
        ``self.app_store_key``.

        Side effects on the datastore: ``self.col_key`` (column names),
        ``self.label_key`` (labels unpickled from the package data directory),
        ``self.hue_key`` (hue columns) and ``self.app_store_key`` (the app).
        ``self.initial_stats`` is set to a zeroed statistics table of the
        first column.

        :returns: status code of execution
        :rtype: StatusCode
        """
        # NOTE(review): ``settings`` is not used below; the lookup is kept in
        # case requesting the service has side effects -- confirm and remove.
        settings = process_manager.service(ConfigObject)
        ds = process_manager.service(DataStore)

        self.logger.debug('Now executing link: {link}.', link=self.name)
        # formerly bare debug print() calls
        self.logger.debug('Link module path: {path}.', path=os.path.abspath(__file__))

        ds[self.col_key] = ds[self.read_key].columns.values

        # NOTE(review): pickle must only be used on trusted files; this one is
        # read from the package's own data directory.
        base_path = os.path.join(os.path.dirname(__file__), '../../../data/' + self.label_key)
        with open(base_path, 'rb') as label_file:
            ds[self.label_key] = pickle.load(label_file)

        ds[self.hue_key] = self.hue_cols

        def _stats_frame(values, name, label, zeroed=False):
            """Return ArrayStats results as a (Variable, Value) data frame."""
            stats = statistics.ArrayStats(values, name, label=label)
            stats.create_stats()
            # one row per statistic after transposing; column 1 is discarded.
            # ``axis`` must be a keyword: pandas >= 2.0 rejects it positionally.
            frame = pd.DataFrame.from_dict(stats.stat_vals, orient='columns').T.drop(1, axis=1)
            if zeroed:
                frame = frame * 0
            frame.reset_index(inplace=True)
            frame.columns = ['Variable', 'Value']
            return frame

        def _histogram_layout(x_title=None):
            """Return the shared plot layout, optionally with an x-axis title."""
            layout_kwargs = dict(plot_bgcolor=self.plt_bgcolor,
                                 paper_bgcolor=self.plt_papercolor,
                                 font=dict(color=self.text_color),
                                 title='Histogram')
            if x_title is not None:
                layout_kwargs['xaxis'] = {'title': x_title}
            return go.Layout(**layout_kwargs)

        def _dropdown_options(columns):
            """Return dropdown options that show the label of each column."""
            return [{'label': ds[self.label_key][x], 'value': x} for x in columns]

        # table shown before a variable is selected: rows of the first
        # column's statistics with all values multiplied by zero
        first_col = ds[self.read_key].columns[0]
        self.initial_stats = _stats_frame(ds[self.read_key].iloc[:, 0].values,
                                          first_col,
                                          ds[self.label_key][first_col],
                                          zeroed=True)

        app = dash.Dash(__name__, assets_folder=os.path.join(os.path.dirname(__file__), '../../../macros/assets/'),)

        app.layout = html.Div([
            html.H1('DataFrames: a summary'),
            html.Div([  # -- the row
                html.Div([  # -- first column
                    html.H5("Variable"),
                    dcc.Dropdown(id='x_dropdown',
                                 options=_dropdown_options(ds[self.col_key]),
                                 value=0, placeholder='Select...',
                                 style={'width': '100%'}),
                    html.H5("Hue"),
                    dcc.Dropdown(id='hue_dropdown',
                                 options=_dropdown_options(ds[self.hue_key]),
                                 value=0, placeholder='Select...',
                                 style={'width': '100%'}),
                ], className='two columns'),
                html.Div([  # -- second column
                    html.Div([
                        dcc.Graph(id='histogram',
                                  figure={'layout': _histogram_layout()}),
                        dcc.Slider(id='bin_slider1',
                                   min=1,
                                   max=100,
                                   step=1,
                                   value=30,
                                   updatemode='drag',
                                   marks={x: {'label': str(x)} for x in range(0, 100, 10)}),
                    ]),
                ], className='six columns'),
                html.Div([  # --  third column
                    html.Div([
                        dash_table.DataTable(id='table',
                                             columns=[{"name": i, "id": i} for i in self.initial_stats],
                                             # 'records' replaces the removed 'rows' abbreviation
                                             data=self.initial_stats.to_dict('records'),
                                             style_as_list_view=True,
                                             style_header={
                                                 'backgroundColor': self.plt_papercolor,
                                                 'color': 'white', },
                                             style_cell={
                                                 'backgroundColor': self.plt_bgcolor,
                                                 'color': 'white'}, ),
                    ], className='offset-by-one columns', style={'width': '20%', }),
                ]),
            ], className='row'),
        ])

        @app.callback(dash.dependencies.Output('histogram', 'figure'),
                      [dash.dependencies.Input('x_dropdown', 'value'),
                       dash.dependencies.Input('hue_dropdown', 'value'),
                       dash.dependencies.Input('bin_slider1', 'value')])
        def update_plot(value_x, hue, bins):
            """Return the histogram figure for the selected variable and hue."""
            import seaborn as sns

            # a cleared dropdown yields None; treat it like the initial 0
            if hue is None:
                hue = 0
            if value_x is None:
                value_x = 0

            df = ds[self.read_key]
            if value_x == 0:
                return {'data': [],
                        'layout': _histogram_layout('Please select a variable')}

            if hue != 0:
                # one trace per distinct hue value, each with its own colour
                hue_values = df[hue].unique()
                pal = sns.palettes.color_palette('YlGnBu', n_colors=len(hue_values)).as_hex()
                traces = [go.Histogram(x=df.loc[df[hue] == hue_value, value_x],
                                       nbinsx=bins,
                                       name=str(hue_value),
                                       marker=dict(color=pal[i]),
                                       text=df[value_x])
                          for i, hue_value in enumerate(hue_values)]
            else:
                traces = [go.Histogram(x=df[value_x],
                                       nbinsx=bins,
                                       text=df[value_x])]

            return {'data': traces,
                    'layout': _histogram_layout(ds[self.label_key][value_x])}

        @app.callback(dash.dependencies.Output('table', 'data'),
                      [dash.dependencies.Input('x_dropdown', 'value')])
        def update_table(value_x):
            """Return the statistics table rows for the selected variable."""
            df = ds[self.read_key]
            if value_x is None:
                # cleared dropdown: zeroed statistics of the first column
                return _stats_frame(df.iloc[:, 0].values, value_x, 'fake',
                                    zeroed=True).to_dict('records')
            if value_x != 0:
                return _stats_frame(df.loc[:, value_x].values, value_x,
                                    ds[self.label_key][value_x]).to_dict('records')
            # value_x == 0 is the dropdown's initial value; as in the original
            # code nothing is computed and None is returned
            return None

        ds[self.app_store_key] = app
        return StatusCode.Success