Beispiel #1
0
    def build(self, parent):
        """
        Build the histogram payload for the parent's selected column.

        When the parent classifies the column as "D" its values are first converted in place on
        ``parent.data`` using ``json_timestamp``.  Without a target a single histogram is built (plus a KDE
        when ``build_kde`` returns one); with a target, one histogram is built per distinct target value.
        The output of ``load_describe`` is always attached under ``desc``, supplemented with any
        ``skew``/``kurt`` entries found in the stored dtype info.

        :param parent: object exposing ``data``, ``data_id``, ``selected_col`` and ``classifier``
        :return: tuple of (chart data dictionary, result of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        # only the un-targeted histogram produces KDE code
        kde_code = []
        if self.target is not None:
            return_data = {"targets": [], "labels": list(range(self.bins))}
            format_target = find_dtype_formatter(find_dtype(parent.data[self.target]))
            subset = parent.data[[self.target, col]]
            for target_val, target_df in subset.groupby(self.target):
                # each target value gets its own histogram; the labels of that histogram are discarded
                hist, _ = self.build_histogram_data(target_df[col])
                hist["target"] = format_target(target_val, as_string=True)
                return_data["targets"].append(hist)
        else:
            return_data, hist_labels = self.build_histogram_data(parent.data[col])
            kde, kde_code = build_kde(parent.data[col], hist_labels, col)
            if kde is not None:
                return_data["kde"] = kde

        desc, desc_code = load_describe(parent.data[col])
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        for stat in ("skew", "kurt"):
            if stat in dtype_info:
                desc[stat] = dtype_info[stat]
        return_data["desc"] = desc

        return return_data, self._build_code(parent, kde_code, desc_code)
Beispiel #2
0
 def __init__(self, data_id, column, cfg):
     """
     Set up the filter builder for ``column`` of the data stored under ``data_id``.

     :param data_id: identifier used to fetch the data from ``global_state``
     :param column: name of the column being filtered
     :param cfg: JSON string describing the filter (its ``type`` entry selects the builder) or None
     :raises KeyError: if the decoded configuration has no ``type`` entry
     """
     self.data_id = data_id
     self.column = column
     s = global_state.get_data(data_id)[column]
     self.classification = classify_type(find_dtype(s))
     # always define the attribute so that a missing or unrecognised configuration leaves
     # ``builder`` as None instead of undefined
     self.builder = None
     self.cfg = cfg
     if self.cfg is None:
         # nothing to decode or dispatch on; subscripting None would raise TypeError
         return
     self.cfg = json.loads(self.cfg)
     filter_type = self.cfg['type']
     if filter_type == 'string':
         self.builder = StringFilter(column, self.classification, self.cfg)
     elif filter_type in ['int', 'float']:
         self.builder = NumericFilter(column, self.classification, self.cfg)
     elif filter_type == 'date':
         self.builder = DateFilter(column, self.classification, self.cfg)
Beispiel #3
0
 def __init__(self, data_id, column, cfg):
     """
     Set up the filter builder for ``column`` of the data stored under ``data_id``.

     :param data_id: identifier used to fetch the data from ``global_state``
     :param column: name of the column being filtered
     :param cfg: JSON string describing the filter (its ``type`` entry selects the builder) or None
     :raises KeyError: if the decoded configuration has no ``type`` entry
     """
     self.data_id = data_id
     self.column = column
     s = global_state.get_data(data_id)[column]
     self.classification = classify_type(find_dtype(s))
     # always define the attribute so that a missing or unrecognised configuration leaves
     # ``builder`` as None instead of undefined
     self.builder = None
     self.cfg = cfg
     if self.cfg is None:
         # nothing to decode or dispatch on; subscripting None would raise TypeError
         return
     self.cfg = json.loads(self.cfg)
     filter_type = self.cfg["type"]
     if filter_type == "string":
         self.builder = StringFilter(column, self.classification, self.cfg)
     elif filter_type in ["int", "float"]:
         self.builder = NumericFilter(column, self.classification, self.cfg)
     elif filter_type == "date":
         self.builder = DateFilter(column, self.classification, self.cfg)
     elif filter_type == "outliers":
         self.builder = OutlierFilter(column, self.classification, self.cfg)
Beispiel #4
0
    def build(self, parent):
        """
        Build the histogram payload for the parent's selected column.

        When the parent classifies the column as "D" its values are first converted in place on
        ``parent.data`` using ``json_timestamp``.  Without a target a single histogram is built (plus a KDE
        when ``build_kde`` returns one).  With a target the column is cut once into ``self.bins`` bins, the
        bin labels are written to a ``"bin"`` column on ``parent.data`` and the per-bin counts are reported
        for every distinct target value.  The output of ``load_describe`` is always attached under ``desc``,
        supplemented with any ``skew``/``kurt`` entries found in the stored dtype info.

        :param parent: object exposing ``data``, ``data_id``, ``selected_col`` and ``classifier``
        :return: tuple of (chart data dictionary, result of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        # only the un-targeted histogram produces KDE code
        kde_code = []
        if self.target is not None:
            binned = pd.cut(parent.data[col], bins=self.bins)
            labels = ["{}".format(interval) for interval in binned.dtype.categories]
            parent.data.loc[:, "bin"] = binned.astype("str")
            return_data = {"targets": [], "labels": labels}
            format_target = find_dtype_formatter(find_dtype(parent.data[self.target]))
            by_target = parent.data[[self.target, "bin"]].groupby(self.target)
            for target_val, target_df in by_target:
                # align the counts to the shared labels so every target reports every bin
                aligned = target_df["bin"].value_counts().reindex(labels, fill_value=0)
                counts = [int(count) for count in aligned.values]
                entry = dict(
                    target=format_target(target_val, as_string=True),
                    data=counts,
                )
                return_data["targets"].append(entry)
        else:
            return_data, hist_labels = self.build_histogram_data(parent.data[col])
            kde, kde_code = build_kde(parent.data[col], hist_labels, col)
            if kde is not None:
                return_data["kde"] = kde

        desc, desc_code = load_describe(parent.data[col])
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        for stat in ("skew", "kurt"):
            if stat in dtype_info:
                desc[stat] = dtype_info[stat]
        return_data["desc"] = desc

        return return_data, self._build_code(parent, kde_code, desc_code)
Beispiel #5
0
    def __init__(self, data_id, req):
        """
        Load the filtered data for ``data_id`` and pick the analysis implementation to run on it.

        The analysis type is read from the ``type`` argument of ``req``; when it is absent it defaults to
        ``histogram`` for columns classified as "F", "I" or "D" and to ``value_counts`` otherwise.  No
        ``analysis`` attribute is set when the type is not one of the handled values.

        :param data_id: identifier of the data, used for settings lookup, loading and code export
        :param req: request object the ``type`` and ``col`` arguments are read from; it is also handed to
                    the data loader and to the analysis constructors
        """
        self.data_id = data_id
        self.analysis_type = get_str_arg(req, "type")
        settings = global_state.get_settings(data_id) or {}
        self.query = build_query(data_id, settings.get("query"))
        frame = load_filterable_data(data_id, req, query=self.query)
        self.selected_col = find_selected_column(
            frame, get_str_arg(req, "col", "values")
        )
        # rows holding a null in the selected column are left out of the analysis
        has_value = ~pd.isnull(frame[self.selected_col])
        self.data = frame[has_value]
        self.dtype = find_dtype(self.data[self.selected_col])
        self.classifier = classify_type(self.dtype)

        import_lines = [
            "import numpy as np",
            "import pandas as pd",
            "import plotly.graph_objs as go",
        ]
        self.code = build_code_export(
            data_id, imports="{}\n".format("\n".join(import_lines))
        )

        if self.analysis_type is None:
            if self.classifier in ["F", "I", "D"]:
                self.analysis_type = "histogram"
            else:
                self.analysis_type = "value_counts"

        # lazy factories so that only the selected analysis is ever constructed
        factories = {
            "geolocation": lambda: GeolocationAnalysis(req),
            "histogram": lambda: HistogramAnalysis(req),
            "categories": lambda: CategoryAnalysis(req),
            "value_counts": lambda: ValueCountAnalysis(req),
            "word_value_counts": lambda: WordValueCountAnalysis(req),
            "qq": lambda: QQAnalysis(),
        }
        factory = factories.get(self.analysis_type)
        if factory is not None:
            self.analysis = factory()
Beispiel #6
0
 def _build_val(col, val):
     """
     Return ``val`` unchanged unless ``col`` is classified as "D", in which case the value is passed
     through ``convert_date_val_to_date`` and then ``json_date``.

     ``data`` is resolved from the enclosing scope.
     """
     is_date_col = classify_type(find_dtype(data[col])) == "D"
     if not is_date_col:
         return val
     as_date = convert_date_val_to_date(val)
     return json_date(as_date)
Beispiel #7
0
def get_inner_replacement_value_as_str(val, series):
    """
    Return the textual form of a replacement value.

    The string "nan" (in any letter case) becomes ``"np.nan"``.  Any other value is wrapped in single
    quotes when ``series`` is classified as "S" and is returned untouched otherwise.

    :param val: replacement value
    :param series: data whose dtype classification decides whether the value gets quoted
    :return: ``"np.nan"``, the single-quoted value, or ``val`` itself
    """
    is_nan_token = isinstance(val, string_types) and val.lower() == "nan"
    if is_nan_token:
        return "np.nan"
    if classify_type(find_dtype(series)) != "S":
        return val
    return "'{value}'".format(value=val)
Beispiel #8
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the axes of the chart as well as the minimum &
    maximum of the x, y & z columns.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be filter strings built from the values of the
    groups.  When animate_by is specified the result also contains a 'frames' list (one entry per animate_by value)
    and 'data' is a deep copy of the data of the last frame.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart, when None a sequential 1..N column named 'x' is used
    :type x: str
    :param y: column(s) to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_val: forwarded to `retrieve_chart_data` as `group_val`
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum.  The values 'raw' & 'drop_duplicates' also
                        allow duplicates in the un-grouped case.
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True only the prepared dataframe is returned (x column under its original name)
    :type return_raw: bool, optional
    :param unlimited_data: forwarded to `check_exceptions` (un-grouped case only)
    :type unlimited_data: bool, optional
    :param animate_by: column whose distinct values each produce one entry of 'frames'
    :type animate_by: str, optional
    :param kwargs: 'z' is read as the z-axis column, the whole dictionary is forwarded to `build_agg_data`
    :return: tuple of (chart data dictionary, list of code-export lines) or a dataframe when return_raw is True
    :raises ChartBuildingError: if the grouping produces more than 30 unique group values
    """
    # integer group/frame values are formatted with a plain "{}" format
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    x_col = str("x")
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    # y columns only take part in sorting when a z column is present
    sort_cols = y_cols if len(z_cols) else []
    # ---- grouped charts: this branch always returns (or raises) ----
    if group_col is not None and len(group_col):
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        # from here on the x-axis column is always named 'x'
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code
        # refuse to build charts with more than MAX_GROUPS distinct group value combinations
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            # NOTE(review): group_fmts is not referenced again inside this branch
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            # the formatted group values are listed in the error message
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            # raw callers get the dataframe only, with the x column under its original name
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        # yields (filter string, formatted data) for every group found in df
        def _load_groups(df):
            for group_val, grp in df.groupby(group_col):

                # yields one filter clause per group column for the current group
                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            # the top-level data mirrors the last frame
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # ---- un-grouped charts ----
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    # keep only the columns the chart needs
    data = data[main_group + y_cols + z_cols]

    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col,
                                        animate_by=animate_by)
        code += agg_code
    data = data.dropna()
    if return_raw:
        # raw callers get the dataframe only, with the x column under its original name
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    # columns handed to check_exceptions (y columns only when a z column is present)
    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)

    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        # the top-level data mirrors the last frame
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
Beispiel #9
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_type=None,
                     group_val=None,
                     bins_val=None,
                     bin_type=None,
                     agg=None,
                     extended_aggregation=[],
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     cleaners=[],
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the axes of the chart as well as the minimum &
    maximum of the charted columns.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be labels built from the values of the groups
    (the matching filter string is stored under '_filter_' in the data of each group).  When animate_by is specified
    the result also contains a 'frames' list (one entry per animate_by value) and 'data' is a deep copy of the data
    of the last frame.

    NOTE(review): `extended_aggregation` and `cleaners` use mutable list defaults; neither list is mutated directly
    in this function.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart, when None a sequential 1..N column named 'x' is used
    :type x: str
    :param y: column(s) to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_type: when equal to 'bins' integer group columns are binned (float group columns always are)
    :type group_type: str, optional
    :param group_val: forwarded to `retrieve_chart_data` as `group_val`
    :param bins_val: number of bins used when a group column is binned
    :type bins_val: int, optional
    :param bin_type: 'width' bins using :func:`pandas.qcut`, any other value bins using :func:`pandas.cut` on
                     interpolated bin edges
    :type bin_type: str, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param extended_aggregation: list of configurations that point to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True only the prepared dataframe is returned (x column under its original name)
    :type return_raw: bool, optional
    :param unlimited_data: forwarded to `check_exceptions` (un-grouped case only)
    :type unlimited_data: bool, optional
    :param animate_by: column whose distinct values each produce one entry of 'frames'
    :type animate_by: str, optional
    :param cleaners: cleaner names, joined with ',' and applied to every column classified as "S"
    :type cleaners: list, optional
    :param kwargs: 'z' is read as the z-axis column, the whole dictionary is forwarded to `build_agg_data`
    :return: tuple of (chart data dictionary, list of code-export lines) or a dataframe when return_raw is True
    :raises ChartBuildingError: if the grouping produces more than MAX_GROUPS unique group values
    """
    # integer group/frame values are formatted with a plain "{}" format
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    # tolerate an explicit None being passed for cleaners
    cleaners = cleaners or []
    if len(cleaners):
        # cleaners are only applied to columns classified as "S"
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners))
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))

    x_col = str("x")
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)

    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # y columns only act as grouping/sorting columns when a z column is present
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    # value columns to report on; replaced by the output of build_agg_data when aggregating
    final_cols = y_cols + z_cols
    # ---- grouped charts: this branch always returns (or raises) ----
    if group_col is not None and len(group_col):
        # bin float group columns (and integer ones when group_type is "bins") into string labels
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            if classifier == "F" or (classifier == "I"
                                     and group_type == "bins"):
                if bin_type == "width":
                    # NOTE(review): pd.qcut builds quantile-based bins; confirm this matches the
                    # intended meaning of the "width" bin type
                    data.loc[:, col] = pd.qcut(data[col],
                                               q=bins_val,
                                               duplicates="drop").astype("str")
                    code.append((
                        "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                    ).format(col=col, bins=bins_val))
                else:
                    # bin edges are interpolated from the sorted non-null values of the column
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(data[col],
                                              bins=equal_freq_bins,
                                              duplicates="drop").astype("str")
                    # NOTE(review): the exported snippet reads from `data` while the other exported
                    # lines use `chart_data`, and neither exported snippet applies astype("str") - confirm
                    code.append((
                        "bins_data = data['{col}'].dropna()\n"
                        "npt = len(bins_data)\n"
                        "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                        "np.sort(bins_data))\n"
                        "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                        'duplicates="drop")').format(col=col,
                                                     bins=bins_val + 1))

        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        # from here on the x-axis column is always named 'x'
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")

        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code

        # refuse to build charts with more than MAX_GROUPS distinct group value combinations
        # (MAX_GROUPS is not defined in this function - presumably a module-level constant)
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            # NOTE(review): group_fmts is not referenced again inside this branch
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            # the formatted group values are listed in the error message
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            # raw callers get the dataframe only, with the x column under its original name
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        # yields (group label, formatted data) for every group found in df
        def _load_groups(df):
            for group_val, grp in df.groupby(group_col):

                # yields one (filter clause, label) pair per group column for the current group
                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                # `data` is local to this generator and does not rebind the outer dataframe
                data = data_f.format_lists(grp)
                data["_filter_"] = group_filter
                yield group_label, data

        if animate_by is not None:
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            # the top-level data mirrors the last frame
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code

    # ---- un-grouped charts ----
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    # keep only the columns the chart needs
    data = data[main_group + final_cols]

    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")

    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()

    if return_raw:
        # raw callers get the dataframe only, with the x column under its original name
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    # columns handed to check_exceptions (y columns only when a z column is present)
    dupe_cols = main_group + y_group_cols
    # the point limit comes from the chart settings: "3d_points" for z/animated charts, else "scatter_points"
    data_limit = global_state.get_chart_settings(
    )["3d_points" if is_z or animate_by is not None else "scatter_points"]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)

    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        # the top-level data mirrors the last frame
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code