Ejemplo n.º 1
0
def get_histogram(data_id):
    """
    :class:`flask:flask.Flask` route which returns output from numpy.histogram to front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
                (defaults to 'values')
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param bins: the number of bins to display in your histogram (defaults to 20), options on the front-end
                 are 5, 10, 20, 50
    :returns: JSON {data: bin counts, labels: bin edges formatted to one decimal place,
              desc: output of load_describe for the selected column} or, on failure,
              JSON {error: exception message, traceback: formatted traceback}
    """
    col = get_str_arg(request, 'col', 'values')
    query = get_str_arg(request, 'query')
    bins = get_int_arg(request, 'bins', 20)
    try:
        data = DATA[data_id]
        if query:
            data = data.query(query)

        selected_col = find_selected_column(data, col)
        # keep only the non-null rows of the selected column (as a single-column frame)
        data = data[~pd.isnull(data[selected_col])][[selected_col]]
        # np.histogram returns (counts, edges); there is one more edge than there are counts
        counts, edges = np.histogram(data, bins=bins)

        desc = load_describe(data[selected_col])
        return jsonify(
            data=[json_float(count) for count in counts],
            labels=['{0:.1f}'.format(edge) for edge in edges],
            desc=desc,
        )
    except Exception as e:
        # Exception (not BaseException) so SystemExit/KeyboardInterrupt still propagate
        # instead of being turned into a JSON error payload
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
Ejemplo n.º 2
0
def build_histogram(data_id, col, query, point_filter):
    """
    Build a modal bar chart containing a 10-bin histogram of a column.

    The data for ``data_id`` is filtered by ``query``, then by the filter
    derived from ``point_filter``; null values of ``col`` are dropped before
    the histogram is computed.

    :param data_id: identifier of the data to load
    :param col: name of the column to build the histogram for
    :param query: query passed to ``run_query`` against the loaded data
    :param point_filter: filter passed to ``build_group_inputs_filter`` to narrow the data further
    :return: the object returned by ``bar_builder`` with its x-axis type and title updated
    """
    base_data = handle_predefined(data_id)
    context_vars = global_state.get_context_variables(data_id)
    data = run_query(base_data, query, context_vars)

    point_query, _ = build_group_inputs_filter(data, [point_filter])
    data = run_query(data, point_query)

    not_null = ~pd.isnull(data[col])
    values = data[not_null][col]

    counts, edges = np.histogram(values, bins=10)
    # each bar is labelled with the upper edge of its bin
    bin_labels = [json_float(edge, precision=3) for edge in edges[1:]]

    axis_inputs = {
        "data": {"all": {"Frequency": counts, "Bins": bin_labels}},
        "min": {"Frequency": 0},
        "max": {"Frequency": max(counts)},
    }
    axes_builder = build_axes(axis_inputs, "Bins", {"type": "single", "data": {}})

    chart_data = {"data": {"all": {"x": bin_labels, "Frequency": counts}}}
    bars = bar_builder(
        chart_data,
        "Bins",
        ["Frequency"],
        axes_builder,
        chart_builder_passthru,
        modal=True,
    )

    bars.figure["layout"]["xaxis"]["type"] = "category"
    title_template = "{} {} ({} {})"
    bars.figure["layout"]["title"]["text"] = title_template.format(
        text("Histogram of"), col, len(values), text("data points")
    )
    return bars
Ejemplo n.º 3
0
    def build(self, parent):
        """
        Build the histogram payload (and accompanying code) for the parent's selected column.

        When ``parent.classifier`` is "D" the selected column is first overwritten
        in ``parent.data`` with the result of applying ``json_timestamp`` to it.

        :param parent: object exposing ``data``, ``selected_col``, ``classifier`` and ``data_id``
        :return: tuple of (dict with "labels", "data", "desc" and, when available, "kde";
                 output of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        series = parent.data[col]
        counts, edges = np.histogram(series, bins=self.bins)
        bin_counts = [json_float(count) for count in counts]
        # the first edge is just the minimum, so labels start at the second edge
        bin_labels = ["{0:.1f}".format(edge) for edge in edges[1:]]
        return_data = {"labels": bin_labels, "data": bin_counts}

        kde, kde_code = build_kde(series, edges, col)
        if kde is not None:
            return_data["kde"] = kde

        desc, desc_code = load_describe(series)
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        for stat in ("skew", "kurt"):
            if stat not in dtype_info:
                continue
            desc[stat] = dtype_info[stat]
        return_data["desc"] = desc

        code = self._build_code(parent, kde_code, desc_code)
        return return_data, code
Ejemplo n.º 4
0
 def build_histogram_data(self, series):
     """
     Compute a histogram of ``series`` using ``self.bins`` bins.

     :param series: values to bin
     :return: tuple of (dict with formatted "labels" and JSON-safe "data" counts,
              the raw bin edges returned by ``np.histogram``)
     """
     counts, edges = np.histogram(series, bins=self.bins)
     bin_counts = [json_float(count) for count in counts]
     # the first edge is just the minimum, so labels start at the second edge
     bin_labels = ["{0:.1f}".format(edge) for edge in edges[1:]]
     payload = {"labels": bin_labels, "data": bin_counts}
     return payload, edges
Ejemplo n.º 5
0
def build_kde(s, hist_labels, selected_col):
    """
    Evaluate a gaussian kernel density estimate of ``s`` at the histogram bin edges.

    :param s: values to fit the density estimate on
    :param hist_labels: points (histogram bin edges) at which the density is evaluated
    :param selected_col: column name inserted into the generated code snippet
    :return: tuple of (list of density values, list of code-snippet lines), or
             ``(None, [])`` when a ``numpy.linalg.LinAlgError`` is raised
    """
    try:
        density = sts.gaussian_kde(s).pdf(hist_labels)
        snippet = [
            "import scipy.stats as sts",
            "kde = sts.gaussian_kde(s['{}'])".format(selected_col),
            "kde_data = kde.pdf(np.linspace(labels.min(), labels.max()))",
        ]
        return [json_float(point, precision=12) for point in density], snippet
    except np.linalg.LinAlgError:
        return None, []
Ejemplo n.º 6
0
 def build_histogram_data(self, series):
     """
     Compute a histogram of ``series``.

     When ``self.density`` is truthy ``np.histogram`` is called with
     ``density=True`` and no explicit bin count; otherwise it is called with
     ``bins=self.bins``.

     :param series: values to bin
     :return: tuple of (dict with formatted "labels" and JSON-safe "data" values,
              the raw bin edges returned by ``np.histogram``)
     """
     if self.density:
         values, edges = np.histogram(series, density=True)
     else:
         values, edges = np.histogram(series, bins=self.bins)

     json_values = [json_float(value) for value in values]
     # the first edge is just the minimum, so labels start at the second edge
     edge_labels = ["{0:.1f}".format(edge) for edge in edges[1:]]
     return {"labels": edge_labels, "data": json_values}, edges
Ejemplo n.º 7
0
def load_describe(column_series, additional_aggs=None):
    """
    Helper function for grabbing the output from :meth:`pandas:pandas.Series.describe` in a JSON serializable format

    :param column_series: data to describe
    :type column_series: :class:`pandas:pandas.Series`
    :param additional_aggs: names of extra :class:`pandas:pandas.Series` aggregation methods to include;
                            "mode" is handled specially and reported as nan unless there is exactly one mode
    :type additional_aggs: list, optional
    :return: tuple of (formatted output from :meth:`pandas:pandas.Series.describe` plus any additional
             aggregations, "total_count", "missing_pct" & "missing_ct"; list of code-snippet strings
             for reproducing the statistics)
    :rtype: tuple
    """
    desc = column_series.describe().to_frame().T
    code = [
        "# main statistics",
        "stats = df['{col}'].describe().to_frame().T".format(col=column_series.name),
    ]
    if additional_aggs:
        for agg in additional_aggs:
            if agg == "mode":
                modes = column_series.mode().values
                # Series.mode() is empty for an empty/all-null series and has several entries on ties;
                # indexing [0] is only safe (and only meaningful) when there is exactly one mode
                desc["mode"] = modes[0] if len(modes) == 1 else np.nan
                # the emitted snippet text is kept verbatim
                code.append(
                    (
                        "# mode\n"
                        "mode = df['{col}'].mode().values\n"
                        "stats['mode'] = np.nan if len(mode) > 1 else mode[0]"
                    ).format(col=column_series.name)
                )
                continue
            desc[agg] = getattr(column_series, agg)()
            code.append(
                "# {agg}\nstats['{agg}'] = df['{col}'].{agg}()".format(
                    col=column_series.name, agg=agg
                )
            )
    desc_f_overrides = {
        "I": lambda f, i, c: f.add_int(i, c, as_string=True),
        "F": lambda f, i, c: f.add_float(i, c, precision=4, as_string=True),
    }
    desc_f = grid_formatter(
        grid_columns(desc), nan_display="nan", overrides=desc_f_overrides
    )
    # desc is a single-row frame, so the first tuple holds every statistic
    desc = desc_f.format_dict(next(desc.itertuples(), None))
    if "count" in desc:
        # pandas always returns 'count' as a float and it adds useless decimal points
        desc["count"] = desc["count"].split(".")[0]
    desc["total_count"] = json_int(len(column_series), as_string=True)
    missing_ct = column_series.isnull().sum()
    desc["missing_pct"] = json_float((missing_ct / len(column_series) * 100).round(2))
    desc["missing_ct"] = json_int(missing_ct, as_string=True)
    return desc, code