Beispiel #1
0
    def input_data(_ts, chart_type, x, y_multi, y_single, z, group, agg, window, rolling_comp, pathname, query):
        """
        dash callback for maintaining chart input state and column-based dropdown options.  This will guard against
        users selecting the same column for multiple axes.

        :param _ts: timestamp input that triggers the callback (value unused)
        :param chart_type: type of chart currently selected
        :param x: currently selected x-axis column
        :param y_multi: selected y-axis columns for charts supporting multiple y values
        :param y_single: selected y-axis column for z-axis charts (single y only)
        :param z: currently selected z-axis column
        :param group: currently selected group-by columns
        :param agg: currently selected aggregation
        :param window: rolling window size (when agg is rolling)
        :param rolling_comp: rolling computation (when agg is rolling)
        :param pathname: URL path used to resolve this instance's data
        :param query: custom pandas query string
        :return: tuple of (input state dict, x options, y-single options, y-multi options, z options, group options)
        """
        # z-axis charts use the single-select y input, everything else the multi-select
        y_val = make_list(y_single if chart_type in ZAXIS_CHARTS else y_multi)
        inputs = dict(query=query, chart_type=chart_type, x=x, y=y_val, z=z, group=group, agg=agg, window=window,
                      rolling_comp=rolling_comp)
        data_id = get_data_id(pathname)
        cols = DATA[data_id].columns
        dtypes = get_dtypes(DATA[data_id])

        def build_selections(*args):
            # flatten any mix of scalars/lists into one list, dropping Nones
            return flatten_lists([[] if a is None else make_list(a) for a in args])

        def build_cols():
            # yield (value, label) pairs; datetime columns expand to one entry per
            # supported frequency (e.g. 'Col1|M' -> 'Col1 (Monthly)')
            for c in cols:
                if classify_type(dtypes[c]) == 'D':
                    for freq in FREQS:
                        if freq in FREQ_LABELS:
                            yield '{}|{}'.format(c, freq), '{} ({})'.format(c, FREQ_LABELS[freq])
                        else:
                            yield c, c
                else:
                    yield c, c

        col_opts = list(build_cols())
        group_val, z_val = (None, z) if chart_type in ZAXIS_CHARTS else (group, None)
        x_options = [build_option(c, l) for c, l in col_opts if c not in build_selections(y_val, z_val, group_val)]
        # single- & multi-y dropdowns share the same filter, so build the list once
        # (was two identical comprehensions) and give each slot its own copy
        y_filter = build_selections(x, group_val, z_val)
        y_multi_options = [build_option(c, l) for c, l in col_opts if c not in y_filter]
        y_single_options = list(y_multi_options)
        z_options = [build_option(c) for c in cols if c not in build_selections(x, y_val, group_val)]
        group_options = [build_option(c, l) for c, l in col_opts if c not in build_selections(x, y_val, z_val)]
        return inputs, x_options, y_single_options, y_multi_options, z_options, group_options
Beispiel #2
0
def describe(column):
    """
    Flask route which returns standard details about column data using :meth:`pandas:pandas.DataFrame.describe` to
    the front-end as JSON

    :param column: name of the column whose details are being requested
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        uniques: array of unique values when data has <= 100 unique values
        success: True/False
    }
    """
    try:
        data = DATA[get_port()]
        series = data[column]
        return_data = dict(describe=load_describe(series), success=True)
        uniq_vals = series.unique()
        # pandas only reports 'unique' for object dtypes; fill it in otherwise
        if 'unique' not in return_data['describe']:
            return_data['describe']['unique'] = json_int(len(uniq_vals),
                                                         as_string=True)
        if len(uniq_vals) <= 100:
            formatter = find_dtype_formatter(get_dtypes(data)[column])
            return_data['uniques'] = [
                formatter(val, nan_display='N/A') for val in uniq_vals
            ]
        return jsonify(return_data)
    except BaseException as e:
        # return the error payload to the front-end rather than a 500
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Beispiel #3
0
def build_input_options(df, **inputs):
    """
    Builds dropdown options for (X, Y, Z, Group, Barsort & Y-Axis Ranges) with filtering based on currently selected
    values for the following inputs: x, y, z, group.

    :param df: dataframe whose columns drive the dropdown options
    :type df: :class:`pandas:pandas.DataFrame`
    :param inputs: current chart input state (keys: chart_type, x, y, z, group)
    :return: tuple of option lists (x, y-multi, y-single, z, group, barsort, yaxis)
    """
    [chart_type, x, y, z,
     group] = [inputs.get(p) for p in ['chart_type', 'x', 'y', 'z', 'group']]
    col_opts = list(build_cols(df.columns, get_dtypes(df)))
    # z-axis charts use "z" in place of "group" when filtering selections
    group_val, z_val = (None, z) if chart_type in ZAXIS_CHARTS else (group,
                                                                     None)
    x_options = [
        build_option(c, l) for c, l in col_opts
        if c not in build_selections(y, z_val, group_val)
    ]
    # single- & multi-y dropdowns share the same filter, so build the list once
    # (was two identical comprehensions) and give each return slot its own copy
    y_filter = build_selections(x, group_val, z_val)
    y_multi_options = [
        build_option(c, l) for c, l in col_opts if c not in y_filter
    ]
    y_single_options = list(y_multi_options)
    z_options = [
        build_option(c) for c in df.columns
        if c not in build_selections(x, y, group_val)
    ]
    group_options = [
        build_option(c, l) for c, l in col_opts
        if c not in build_selections(x, y, z_val)
    ]
    barsort_options = [build_option(o) for o in build_selections(x, y)]
    yaxis_options = [build_option(y2) for y2 in y or []]
    return x_options, y_multi_options, y_single_options, z_options, group_options, barsort_options, yaxis_options
Beispiel #4
0
def date_freq_handler(df):
    """
    This returns a column definition handler which returns a series based on the specs from the front-end.
    Column definitions can be a column name 'Col1' or a column name with a frequency 'Col1|M' for
    columns which are of type datetime.

    :Example:
        Col1 -> returns series for Col1
        Col1|M -> returns series for Col1 in monthly format with name 'Col1|M'

    :param df: dataframe whose data needs to be checked
    :type df: :class:`pandas:pandas.DataFrame`
    :return: handler function
    :rtype: func
    """
    dtypes = get_dtypes(df)
    # keep the original index so frequency-converted values line up with df
    orig_idx = df.index

    def _handler(col_def):
        segs = col_def.split('|')
        # plain column reference (or non-datetime column): return it untouched
        if len(segs) <= 1 or classify_type(dtypes[segs[0]]) != 'D':
            return df[col_def]
        col, freq = segs
        if freq == 'WD':
            values = df.set_index(col).index.dayofweek.values
        elif freq == 'H2':
            values = df.set_index(col).index.hour.values
        else:
            # any other frequency is delegated to pandas' to_period (e.g. 'M', 'Q', 'Y')
            values = df.set_index(col).index.to_period(
                freq).to_timestamp(how='end').values
        return pd.Series(values, index=orig_idx, name=col_def)

    return _handler
Beispiel #5
0
def animate_styles(df, **inputs):
    """Return display styles for the animate & animate-by inputs plus animate-by dropdown options."""
    chart_type, agg, cpg = (inputs.get(p) for p in ['chart_type', 'agg', 'cpg'])
    # charts-per-group & percentage aggregations don't support animation
    if cpg or agg in ['pctsum', 'pctct']:
        return dict(display='none'), dict(display='none'), []
    if chart_type in ANIMATION_CHARTS:
        return dict(display='block'), dict(display='none'), []
    opts = []
    if chart_type in ANIMATE_BY_CHARTS:
        opts = [
            build_option(v, l)
            for v, l in build_cols(df.columns, get_dtypes(df))
        ]
    if opts:
        return dict(display='none'), dict(display='block'), opts
    return dict(display='none'), dict(display='none'), []
Beispiel #6
0
def build_group_inputs_filter(df, group_inputs):
    """Build a pandas query string OR-ing together one filter per group-value mapping."""
    dtypes = get_dtypes(df)

    def _single_filter(group_val):
        # AND together one clause per grouped column
        clauses = [
            group_filter_handler(col, val, classify_type(dtypes[col]))
            for col, val in group_val.items()
        ]
        return " and ".join(clauses)

    filters = [_single_filter(gv) for gv in group_inputs]
    return "({})".format(") or (".join(filters))
Beispiel #7
0
def animate_by_style(df, **inputs):
    """Return the display style & dropdown options for the "Animate By" input."""
    chart_type, cpg = (inputs.get(p) for p in ['chart_type', 'cpg'])
    # charts-per-group can't be animated
    if cpg:
        return dict(display='none'), []
    if chart_type in ANIMATION_CHARTS:
        opts = [build_option('chart_values', 'Values in Chart')]
    elif chart_type in ANIMATE_BY_CHARTS:
        opts = [
            build_option(v, l)
            for v, l in build_cols(df.columns, get_dtypes(df))
        ]
    else:
        opts = []
    if opts:
        return dict(display='block'), opts
    return dict(display='none'), []
Beispiel #8
0
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: name of the column whose details are being requested
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        uniques: dict of unique values (all of them when <= 100, otherwise the 100 most common),
        success: True/False
    }
    """
    try:
        data = DATA[data_id][[column]]
        dtype = next((dtype_info['dtype'] for dtype_info in DTYPES[data_id]
                      if dtype_info['name'] == column), None)
        # numeric columns get extra aggregations beyond pandas' describe defaults
        additional_aggs = None
        if classify_type(dtype) in ['I', 'F']:
            additional_aggs = [
                'sum', 'median', 'mode', 'var', 'sem', 'skew', 'kurt'
            ]
        return_data = dict(
            describe=load_describe(data[column], additional_aggs=additional_aggs),
            success=True)
        uniq_vals = data[column].unique()
        # pandas only reports 'unique' for object dtypes; fill it in otherwise
        if 'unique' not in return_data['describe']:
            return_data['describe']['unique'] = json_int(len(uniq_vals),
                                                         as_string=True)
        uniq_f = find_dtype_formatter(get_dtypes(data)[column])
        if len(uniq_vals) <= 100:
            return_data['uniques'] = dict(data=[uniq_f(u) for u in uniq_vals],
                                          top=False)
        else:
            # too many uniques to ship them all; send the 100 most common instead
            top_vals = data[column].value_counts().sort_values(
                ascending=False).head(100).index.values
            return_data['uniques'] = dict(data=[uniq_f(u) for u in top_vals],
                                          top=True)
        return jsonify(return_data)
    except BaseException as e:
        # return the error payload to the front-end rather than a 500
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Beispiel #9
0
def date_freq_handler(df):
    """
    This returns a column definition handler which returns a series based on the specs from the front-end.
    Column definitions can be a column name 'Col1' or a column name with a frequency 'Col1|M' for
    columns which are of type datetime.

    :Example:
        Col1 -> returns series for Col1
        Col1|M -> returns series for Col1 in monthly format with name 'Col1|M'

    :param df: dataframe whose data needs to be checked
    :type df: :class:`pandas:pandas.DataFrame`
    :return: handler function
    :rtype: func
    """
    dtypes = get_dtypes(df)
    # preserve the original index so frequency-converted values line up with df
    orig_idx = df.index

    def _handler(col_def):
        # Resolve one column definition to a (series, code-snippet string) pair;
        # the snippet is the pandas code equivalent of what was executed.
        col_def_segs = col_def.split("|")
        if len(col_def_segs) > 1 and classify_type(dtypes[col_def_segs[0]]) == "D":
            col, freq = col_def_segs
            if freq == "WD":
                # day-of-week via the pandas DatetimeIndex `dayofweek` accessor
                code = "df.set_index('{col}').index.dayofweek.values"
                freq_grp = df.set_index(col).index.dayofweek.values
            elif freq == "H2":
                # hour-of-day via the pandas DatetimeIndex `hour` accessor
                code = "df.set_index('{col}').index.hour.values"
                freq_grp = df.set_index(col).index.hour.values
            else:
                # any other frequency string is delegated to pandas' to_period
                code = "df.set_index('{col}').index.to_period('{freq}').to_timestamp(how='end').values"
                freq_grp = (
                    df.set_index(col)
                    .index.to_period(freq)
                    .to_timestamp(how="end")
                    .values
                )
            # wrap the snippet so callers can splice it into a pd.concat([...]) call
            code = "\tpd.Series(" + code + ", index=df.index, name='{col_def}'),"
            freq_grp = pd.Series(freq_grp, index=orig_idx, name=col_def)
            return freq_grp, code.format(col=col, freq=freq, col_def=col_def)
        else:
            # plain column reference, no frequency conversion
            return df[col_def], "\tdf['{col_def}'],".format(col_def=col_def)

    return _handler
Beispiel #10
0
def retrieve_chart_data(df, *args, **kwargs):
    """
    Retrieves data from a dataframe for x, y, z & group inputs complete with date frequency
    formatting (:meth:`dtale.charts.utils.date_freq_handler`) if specified

    :param df: dataframe that contains data for chart
    :type df: :class:`pandas:pandas.DataFrame`
    :param args: columns to use
    :type args: iterable of str
    :param kwargs: optional 'group_val' list of {column: value} mappings used to filter the output
    :return: tuple of (dataframe of data required for chart construction, list of
             equivalent pandas code strings)
    :rtype: tuple
    """
    freq_handler = date_freq_handler(df)
    all_data, all_code = [], []
    for col in flatten_lists([make_list(a) for a in args]):
        if col is None:
            continue
        series, snippet = freq_handler(col)
        all_data.append(series)
        if snippet is not None:
            all_code.append(snippet)
    all_data = pd.concat(all_data, axis=1)
    all_code = ["chart_data = pd.concat(["] + all_code + ["], axis=1)"]
    if len(make_list(kwargs.get('group_val'))):
        dtypes = get_dtypes(all_data)

        def _group_filter(group_val):
            # AND together one clause per grouped column
            return ' and '.join(
                group_filter_handler(gc, gv, classify_type(dtypes[gc]))
                for gc, gv in group_val.items())

        # OR together the per-group filters and apply them as a pandas query
        filters = '({})'.format(') or ('.join(
            _group_filter(gv) for gv in kwargs['group_val']))
        all_data = all_data.query(filters)
        all_code.append('chart_data = chart_data.query({})'.format(filters))
    return all_data, all_code
Beispiel #11
0
def build_map_options(df,
                      type='choropleth',
                      loc=None,
                      lat=None,
                      lon=None,
                      map_val=None):
    """
    Build dropdown options for map charts (location, latitude, longitude & value)
    filtered against the currently selected inputs.

    NOTE: `type` shadows the builtin but is kept as-is for caller compatibility.
    """
    dtypes = get_dtypes(df)
    float_cols, str_cols, num_cols = [], [], []
    for col in sorted(dtypes.keys()):
        classification = classify_type(dtypes[col])
        if classification == 'S':
            str_cols.append(col)
        elif classification in ['F', 'I']:
            num_cols.append(col)
            # lat/lon must be floats; ints only qualify as values
            if classification == 'F':
                float_cols.append(col)

    lat_options = [
        build_option(c) for c in float_cols
        if c not in build_selections(lon, map_val)
    ]
    lon_options = [
        build_option(c) for c in float_cols
        if c not in build_selections(lat, map_val)
    ]
    loc_options = [
        build_option(c) for c in str_cols if c not in build_selections(map_val)
    ]
    # choropleths exclude the location column from values; other maps exclude lat/lon
    if type == 'choropleth':
        val_options = [
            build_option(c) for c in num_cols if c not in build_selections(loc)
        ]
    else:
        val_options = [
            build_option(c) for c in num_cols
            if c not in build_selections(lon, lat)
        ]
    return loc_options, lat_options, lon_options, val_options
Beispiel #12
0
def build_dtypes_state(data):
    """
    Helper function to build globally managed state pertaining to a D-Tale instances columns & data types

    :param data: dataframe to build data type information for
    :type data: :class:`pandas:pandas.DataFrame`
    :return: a list of dictionaries containing column names, indexes and data types
             ('min'/'max' included for float columns with finite bounds)
    """
    dtypes = get_dtypes(data)
    mins = data.min().to_dict()
    maxs = data.max().to_dict()

    def _format_dtype(col_index, col):
        dtype = dtypes[col]
        dtype_data = dict(name=col, dtype=dtype, index=col_index)
        if classify_type(dtype) == 'F':  # floats
            # omit bounds when they're NaN/inf (e.g. an all-NaN column, where
            # data.min() yields NaN) since those values aren't JSON-serializable
            # and would poison downstream state
            if np.isfinite(mins[col]) and np.isfinite(maxs[col]):
                dtype_data['min'] = mins[col]
                dtype_data['max'] = maxs[col]
        return dtype_data

    return [_format_dtype(i, c) for i, c in enumerate(data.columns)]
Beispiel #13
0
def build_dtypes_state(data):
    """
    Helper function to build globally managed state pertaining to a D-Tale instances columns & data types

    :param data: dataframe to build data type information for
    :type data: :class:`pandas:pandas.DataFrame`
    :return: a list of dictionaries containing column names, indexes and data types
    """
    dtypes = get_dtypes(data)
    ranges = data.agg([min, max]).to_dict()

    def _format_dtype(col_index, col):
        dtype = dtypes[col]
        dtype_data = dict(name=col, dtype=dtype, index=col_index)
        # only float columns get min/max merged in, and only when the column
        # has data and both bounds are finite
        if classify_type(dtype) != 'F' or data[col].isnull().all():
            return dtype_data
        col_ranges = ranges[col]
        if not any(np.isnan(v) or np.isinf(v) for v in col_ranges.values()):
            dtype_data = dict_merge(col_ranges, dtype_data)
        return dtype_data

    return [_format_dtype(idx, col) for idx, col in enumerate(data.columns)]
Beispiel #14
0
def build_group_inputs_filter(df, group_inputs):
    dtypes = get_dtypes(df)

    def _group_filter(group_val):
        for gc, gv in group_val.items():
            classifier = classify_type(dtypes[gc])
            yield group_filter_handler(gc, gv, classifier)

    def _full_filter():
        for group_val in group_inputs:
            filter_vals, label_vals = [], []
            for fv, lv in _group_filter(group_val):
                filter_vals.append(fv)
                label_vals.append(lv)
            yield " and ".join(filter_vals), ", ".join(label_vals)

    full_filters, full_labels = [], []
    for ff, fl in _full_filter():
        full_filters.append(ff)
        full_labels.append(fl)
    return ("({})".format(") or (".join(full_filters)), ", ".join(full_labels))
Beispiel #15
0
 def build_x_dropdown(is_open, pathname, inputs, chart_inputs, yaxis_data, map_data):
     """
     Dash callback body which rebuilds the X-axis dropdown options (and the current
     x value) from the combined chart input state.

     :param is_open: whether the containing panel is open; raises PreventUpdate otherwise
     :param pathname: URL path used to look up this D-Tale instance's data
     :param inputs: chart input state to be merged via `combine_inputs`
     :param chart_inputs: additional chart input state
     :param yaxis_data: y-axis input state
     :param map_data: map-chart input state
     :return: tuple of (x dropdown options, selected x value)
     """
     if not is_open:
         raise PreventUpdate
     df = global_state.get_data(get_data_id(pathname))
     all_inputs = combine_inputs(
         dash_app, inputs, chart_inputs, yaxis_data, map_data
     )
     chart_type, x, y, z, group, map_val, animate_by = (
         all_inputs.get(p)
         for p in ["chart_type", "x", "y", "z", "group", "map_val", "animate_by"]
     )
     if chart_type == "maps":
         # maps don't use a free-form x-axis: choropleths use their location column,
         # every other map type a synthetic 'lat_lon' value
         if all_inputs.get("map_type") == "choropleth":
             x = all_inputs["loc"]
         else:
             x = "lat_lon"
         x_options = build_selections(map_val, animate_by)
     else:
         x_options = build_selections(z or y, animate_by)
     # at this point x_options holds the columns to EXCLUDE; now build the final list
     col_opts = list(build_cols(df.columns, get_dtypes(df)))
     x_options = [build_option(c, l) for c, l in col_opts if c not in x_options]
     return x_options, x
Beispiel #16
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart (when None a sequential integer column is synthesized)
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of str, optional
    :param group_val: list of dicts (column -> value) selecting specific groups to display
    :type group_val: list of dict, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True, short-circuit and return the prepared dataframe instead of the chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when True, skip the row-count limit in check_exceptions
    :type unlimited_data: bool, optional
    :param animate_by: column whose sorted values become animation frames
    :type animate_by: str, optional
    :return: tuple of (chart data dict, list of equivalent pandas code strings),
             or a dataframe when return_raw is True
    """
    # format integer group/frame values with a plain '{}' format
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    x_col = str("x")
    if x is None:
        # no x-axis specified: synthesize one from the row position
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    sort_cols = y_cols if len(z_cols) else []
    if group_col is not None and len(group_col):
        # ---- grouped chart path ----
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code
        # cap the number of distinct groups so the chart stays readable
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # NOTE(review): user-facing message contains a typo ('from then' -> 'from the')
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yield (query-string label, formatted series data) for each group in df
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            # one frame per animate_by value; initial data is the last frame's data
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # ---- ungrouped chart path ----
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    data = data[main_group + y_cols + z_cols]

    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col,
                                        animate_by=animate_by)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    # duplicate x-values are only legal for certain chart/agg combinations
    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)

    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols
        },
    )
    if animate_by is not None:
        # one frame per animate_by value; initial data is the last frame's data
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
Beispiel #17
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of str, optional
    :param group_val: list of dicts (column -> value) selecting specific groups to display
    :type group_val: list of dict, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True, short-circuit and return the prepared dataframe instead of the chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when True, skip the row-count limit in check_exceptions
    :type unlimited_data: bool, optional
    :return: tuple of (chart data dict, list of equivalent pandas code strings),
             or a dataframe when return_raw is True
    """

    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get('z'),
                                     group_col,
                                     group_val=group_val)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = make_list(z_col)
    if group_col is not None and len(group_col):
        # ---- grouped chart path ----
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None and agg != 'raw':
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append(
                "chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()"
                .format(cols="', '".join(group_col + [x]), agg=agg))
        # cap the number of distinct groups so the chart stays readable
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            # format integer group values with a plain '{}' format
            group_fmt_overrides = {
                'I':
                lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
            }
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # NOTE(review): format() is passed three args but the message has only two
            # placeholders, so the group listing never appears in the text; also
            # 'from then' is a typo for 'from the'
            msg = (
                'Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. '
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                'are listed below:').format(', '.join(group_col), MAX_GROUPS,
                                            group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
        )

        dtypes = get_dtypes(data)
        # format integer group values with a plain '{}' format
        group_fmt_overrides = {
            'I':
            lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
        }
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }
        # one series per group, keyed by its pandas-query-style filter string
        for group_val, grp in data.groupby(group_col):

            def _group_filter():
                for gv, gc in zip(make_list(group_val), group_col):
                    classifier = classify_type(dtypes[gc])
                    yield group_filter_handler(
                        gc, group_fmts[gc](gv, as_string=True), classifier)

            group_filter = ' and '.join(list(_group_filter()))
            ret_data['data'][group_filter] = data_f.format_lists(grp)
        return ret_data, code
    # ---- ungrouped chart path ----
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(
        cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    # duplicate x-values are only legal for certain chart/agg combinations
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}),
                     allow_duplicates or agg == 'raw',
                     unlimited_data=unlimited_data,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(data={str('all'): data_f.format_lists(data)},
                    min={
                        col: fmt(data[col].min(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    },
                    max={
                        col: fmt(data[col].max(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    })
    return ret_data, code
Beispiel #18
0
def heatmap_builder(data_id, **inputs):
    """
    Builder function for :plotly:`plotly.graph_objects.Heatmap <plotly.graph_objects.Heatmap>`

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param inputs: Optional keyword arguments containing the following information:
        - x: column to be used as x-axis of chart
        - y: column to be used as y-axis of chart
        - z: column to use for the Z-Axis
        - agg: points to a specific function that can be applied to :func: pandas.core.groupby.DataFrameGroupBy
    :type inputs: dict
    :return: heatmap
    :rtype: :plotly:`plotly.graph_objects.Heatmap <plotly.graph_objects.Heatmap>`
    """

    try:
        # bail out early (returning None) when the chart inputs are not valid
        if not valid_chart(**inputs):
            return None
        raw_data = global_state.get_data(data_id)
        wrapper = chart_wrapper(data_id, raw_data, inputs)
        # base heatmap kwargs; the 'corr' aggregation branch below overrides
        # the colorscale and adds zmin/zmax
        hm_kwargs = dict(hoverongaps=False,
                         colorscale='Greens',
                         showscale=True,
                         hoverinfo='x+y+z')
        x, y, z, agg = (inputs.get(p) for p in ['x', 'y', 'z', 'agg'])
        # heatmaps only support a single y-axis column, take the first selection
        y = y[0]
        data = retrieve_chart_data(raw_data, x, y, z)
        x_title = update_label_for_freq(x)
        y_title = update_label_for_freq(y)
        z_title = z
        data = data.sort_values([x, y])
        check_all_nan(data)
        dupe_cols = [x, y]
        if agg is not None:
            # surface the aggregation in the z-axis label, e.g. "col (Mean)"
            z_title = '{} ({})'.format(z_title, AGGS[agg])
            if agg == 'corr':
                data = data.dropna()
                # pivot y into columns and compute the pairwise correlation
                # matrix of the resulting columns
                data = data.set_index([x, y]).unstack().corr()
                data = data.stack().reset_index(0, drop=True)
                y_title = x_title
                # the correlation index has duplicate level names; suffix them
                # with their position to make unique column names
                dupe_cols = [
                    '{}{}'.format(col, i)
                    for i, col in enumerate(data.index.names)
                ]
                [x, y] = dupe_cols
                data.index.names = dupe_cols
                data = data.reset_index()
                # blank out the diagonal (self-correlation) cells
                data.loc[data[x] == data[y], z] = np.nan
                # red/yellow/green diverging scale pinned to the [-1, 1]
                # correlation range
                hm_kwargs = dict_merge(
                    hm_kwargs,
                    dict(colorscale=[[0, 'red'], [0.5, 'yellow'],
                                     [1.0, 'green']],
                         zmin=-1,
                         zmax=1))
            else:
                data = build_agg_data(data, x, y, inputs, agg, z=z)
        if not len(data):
            raise Exception('No data returned for this computation!')
        # duplicate checks are skipped for 'corr' since the correlation matrix
        # legitimately repeats coordinate values
        check_exceptions(
            data[dupe_cols],
            agg != 'corr',
            data_limit=40000,
            limit_msg=
            'Heatmap exceeds {} cells, cannot render. Please apply filter...')
        dtypes = {
            c: classify_type(dtype)
            for c, dtype in get_dtypes(data).items()
        }
        data_f, _ = chart_formatters(data)
        data = data_f.format_df(data)
        data = data.sort_values([x, y])
        data = data.set_index([x, y])
        # pivot x (index level 0) into columns and keep the z values,
        # producing the 2-D grid fed to the heatmap
        data = data.unstack(0)[z]

        x_data = weekday_tick_handler(data.columns, x)
        y_data = weekday_tick_handler(data.index.values, y)
        heat_data = data.values

        x_axis = dict_merge({
            'title': x_title,
            'tickangle': -20
        }, build_spaced_ticks(x_data))
        # integer columns get whole-number tick formatting
        if dtypes.get(x) == 'I':
            x_axis['tickformat'] = '.0f'

        y_axis = dict_merge({
            'title': y_title,
            'tickangle': -20
        }, build_spaced_ticks(y_data))
        if dtypes.get(y) == 'I':
            y_axis['tickformat'] = '.0f'

        # customdata carries the (x, y) pair for each cell so the hover
        # template can display formatted coordinate values
        hovertemplate = ''.join([
            x_title, ': %{customdata[0]}<br>', y_title,
            ': %{customdata[1]}<br>', z_title, ': %{z}<extra></extra>'
        ])
        hm_kwargs = dict_merge(
            hm_kwargs,
            dict(z=heat_data,
                 colorbar={'title': z_title},
                 hoverinfo='x+y+z',
                 hovertemplate=hovertemplate,
                 customdata=[[[xd, yd] for xd in x_data] for yd in y_data]))
        return wrapper(
            dcc.Graph(id='heatmap-graph-{}'.format(y),
                      style={
                          'margin-right': 'auto',
                          'margin-left': 'auto',
                          'height': 600
                      },
                      figure=dict(data=[go.Heatmap(**hm_kwargs)],
                                  layout=build_layout(
                                      dict_merge(
                                          dict(xaxis=x_axis,
                                               yaxis=y_axis,
                                               xaxis_zeroline=False,
                                               yaxis_zeroline=False),
                                          build_title(x, y, z=z, agg=agg))))))
    except BaseException as e:
        # surface any failure as an error component rather than crashing the app
        return build_error(str(e), str(traceback.format_exc()))
Beispiel #19
0
def build_axes(data_id, x, axis_inputs, mins, maxs, z=None, agg=None):
    """
    Returns helper function for building axis configurations against a specific y-axis.

    :param data_id: identifier of data to build axis configurations against
    :type data_id: str
    :param x: column to be used as x-axis of chart
    :type x: str
    :param axis_inputs: current settings for y-axis limits
    :type axis_inputs: dict
    :param mins: minimums for all columns involved in chart
    :type mins: dict
    :param maxs: maximums for all columns involved in chart
    :type maxs: dict
    :param z: column to use for the Z-Axis
    :type z: str, optional
    :param agg: specific aggregation that can be applied to y or z axes.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum.  This is included in label of axis it is being applied to.
    :type agg: str, optional
    :return: handler function to be applied against each y-axis used in chart
    :rtype: func
    """
    data = global_state.get_data(data_id)
    dtypes = get_dtypes(data)

    def _build_axes(y):
        # builds the plotly axis config dict for the list of y-axis columns
        axes = {'xaxis': dict(title=update_label_for_freq(x))}
        positions = []
        for i, y2 in enumerate(y, 0):
            # alternate additional y-axes: odd indexes on the right, even on the left
            right = i % 2 == 1
            # axis_ct counts how many left/right pairs precede this axis
            axis_ct = int(i / 2)
            title = update_label_for_freq(y2)
            # only append the aggregation to y-axis labels when there is no
            # z-axis (with a z-axis the aggregation applies to z instead)
            if z is None and agg is not None:
                title = '{} ({})'.format(title, AGGS[agg])
            value = dict(title=title)
            if i == 0:
                key = 'yaxis'
            else:
                # plotly names secondary axes yaxis2, yaxis3, ...
                key = 'yaxis{}'.format(i + 1)
                value = dict_merge(
                    value,
                    dict(overlaying='y', side='right' if right else 'left'))
                value['anchor'] = 'free' if axis_ct > 0 else 'x'
                if axis_ct > 0:
                    # offset each additional axis pair 5% further in from the
                    # chart edge (axis_ct / 20 == axis_ct * 0.05)
                    pos = axis_ct / 20.0
                    value['position'] = (1 - pos) if right else pos
                    positions.append(value['position'])
            # only apply a custom range when the user-specified min/max differ
            # from the data-driven min/max
            if y2 in axis_inputs and not (axis_inputs[y2]['min'],
                                          axis_inputs[y2]['max']) == (
                                              mins[y2], maxs[y2]):
                value['range'] = [
                    axis_inputs[y2]['min'], axis_inputs[y2]['max']
                ]
            # integer columns get whole-number tick formatting
            if classify_type(dtypes.get(y2)) == 'I':
                value['tickformat'] = '.0f'
            axes[key] = value
        # shrink the x-axis domain so free-floating y-axes don't overlap the plot
        if len(positions):
            if len(positions) == 1:
                domain = [positions[0] + 0.05, 1]
            elif len(positions) == 2:
                domain = sorted(positions)
                domain = [domain[0] + 0.05, domain[1] - 0.05]
            else:
                lower, upper = divide_chunks(sorted(positions), 2)
                domain = [lower[-1] + 0.05, upper[0] - 0.05]
            axes['xaxis']['domain'] = domain
        if classify_type(dtypes.get(x)) == 'I':
            axes['xaxis']['tickformat'] = '.0f'
        if z is not None:
            axes['zaxis'] = dict(
                title=z if agg is None else '{} ({})'.format(z, AGGS[agg]))
            if classify_type(dtypes.get(z)) == 'I':
                axes['zaxis']['tickformat'] = '.0f'
        return axes

    return _build_axes
Beispiel #20
0
def build_chart(data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param data: dataframe to be used for chart
    :type data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicate x values (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param kwargs: optional keyword arguments; 'rolling_win' & 'rolling_comp' are read when agg == 'rolling'
    :return: dict
    """

    def build_formatters(df):
        # two formatters: one for serializing series data, one (with 2-decimal
        # float precision) for the min/max range values
        cols = grid_columns(df)
        data_f = grid_formatter(cols, nan_display=None)
        overrides = {'F': lambda f, i, c: f.add_float(i, c, precision=2)}
        range_f = grid_formatter(cols, overrides=overrides, nan_display=None)
        return data_f, range_f

    def check_all_nan(df, cols):
        # guard against charting a column with no usable values
        for col in cols:
            if df[col].isnull().all():
                raise Exception('All data for column "{}" is NaN!'.format(col))

    x_col = str('x')
    y_cols = make_list(y)
    if group_col is not None:
        # grouped branch: one series per unique group value
        data = data[group_col + [x] + y_cols].sort_values(group_col + [x])
        check_all_nan(data, [x] + y_cols)
        y_cols = [str(y_col) for y_col in y_cols]
        # normalize the x column name to 'x' for downstream formatting
        data.columns = group_col + [x_col] + y_cols
        if agg is not None:
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
        # cap the series count so the chart stays readable
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)

        data_f, range_f = build_formatters(data[[x_col] + y_cols])
        ret_data = dict(
            data={},
            min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
            max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        )
        dtypes = get_dtypes(data)
        group_fmts = {c: find_dtype_formatter(dtypes[c]) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # series key: group values formatted and joined with '/'
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True) for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        return ret_data
    # ungrouped branch: single 'all' series
    data = data[[x] + y_cols].sort_values(x)
    check_all_nan(data, [x] + y_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols
    if agg is not None:
        if agg == 'rolling':
            # rolling window aggregation driven by kwargs (window size + computation)
            window, comp = map(kwargs.get, ['rolling_win', 'rolling_comp'])
            data = data.set_index(x_col).rolling(window=window)
            data = pd.DataFrame({c: getattr(data[c], comp)() for c in y_cols})
            data = data.reset_index()
        else:
            data = data.groupby(x_col)
            data = getattr(data[y_cols], agg)().reset_index()

    if not allow_duplicates and any(data[x_col].duplicated()):
        raise Exception('{} contains duplicates, please specify group or additional filtering'.format(x))
    if len(data) > 15000:
        raise Exception('Dataset exceeds 15,000 records, cannot render. Please apply filter...')
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
    )
    return ret_data
Beispiel #21
0
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_type=None,
                     group_val=None,
                     bins_val=None,
                     bin_type=None,
                     agg=None,
                     extended_aggregation=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     cleaners=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_type: how to group: by value or by bins (when 'bins', numeric group columns are bucketed)
    :type group_type: str, optional
    :param group_val: specific group values to filter chart data to
    :type group_val: list, optional
    :param bins_val: number of bins to use when group_type == 'bins'
    :type bins_val: int, optional
    :param bin_type: 'width' for equal-width (pd.qcut) bins, otherwise equal-frequency bins
    :type bin_type: str, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param extended_aggregation: list of configurations that point to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: if True, return the prepared dataframe instead of the serialized chart payload
    :type return_raw: bool, optional
    :param unlimited_data: if True, skip the data-point limit checks
    :type unlimited_data: bool, optional
    :param animate_by: column to produce animation frames from
    :type animate_by: str, optional
    :param cleaners: string-cleaner names to apply to string columns
    :type cleaners: list, optional
    :return: dict
    """
    # normalize mutable-default parameters (avoid shared mutable default arguments)
    extended_aggregation = extended_aggregation or []
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    cleaners = cleaners or []
    if len(cleaners):
        # apply string cleaners to every string-classified column
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners))
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))

    x_col = str("x")
    if x is None:
        # no x-axis specified: synthesize one from the row position
        x = x_col
        data.loc[:, x_col] = range(1,
                                   len(data) +
                                   1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)

    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # with a z-axis the y columns act as grouping coordinates rather than values
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    final_cols = y_cols + z_cols
    if group_col is not None and len(group_col):
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            # bucket float columns (and ints when group_type == 'bins') into bins
            if classifier == "F" or (classifier == "I"
                                     and group_type == "bins"):
                if bin_type == "width":
                    data.loc[:, col] = pd.qcut(data[col],
                                               q=bins_val,
                                               duplicates="drop").astype("str")
                    code.append((
                        "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                    ).format(col=col, bins=bins_val))
                else:
                    # equal-frequency bins: interpolate bin edges over the sorted values
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(data[col],
                                              bins=equal_freq_bins,
                                              duplicates="drop").astype("str")
                    code.append((
                        "bins_data = data['{col}'].dropna()\n"
                        "npt = len(bins_data)\n"
                        "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                        "np.sort(bins_data))\n"
                        "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                        'duplicates="drop")').format(col=col,
                                                     bins=bins_val + 1))

        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")

        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code

        # cap the number of distinct groups so the chart stays readable
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }

            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yields (group label, serialized series) pairs for each group in df
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    # builds a filter clause per group column for this group value
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                data = data_f.format_lists(grp)
                data["_filter_"] = group_filter
                yield group_label, data

        if animate_by is not None:
            # one frame per animate_by value; the last frame doubles as the
            # initial chart data
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code

    # ungrouped branch: single 'all' series (optionally broken into frames)
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    data = data[main_group + final_cols]

    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")

    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")

    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()

    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = main_group + y_group_cols
    # 3-D & animated charts get the larger point budget
    data_limit = global_state.get_chart_settings(
    )["3d_points" if is_z or animate_by is not None else "scatter_points"]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)

    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
Beispiel #22
0
def build_chart(raw_data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param agg: points to a specific function that can be applied to
                        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                        median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param kwargs: optional keyword arguments; 'z' points to a column to use as the z-axis
    :return: dict of serialized chart data & list of code snippets reproducing the transformations
    """

    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get('z'), group_col)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = []
    if z_col is not None:
        z_cols = [z_col]
    if group_col is not None and len(group_col):
        # grouped branch: one series per unique group value
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        # normalize the x column name to 'x' for downstream formatting
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})")
        if agg is not None:
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append("chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()".format(
                cols="', '".join(group_col + [x]), agg=agg
            ))
        # cap the series count so the chart stays readable
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)

        data = data.dropna()
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
            max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        )

        dtypes = get_dtypes(data)
        # render integer group values without thousands separators
        group_fmt_overrides = {'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')}
        group_fmts = {c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # series key: group values formatted and joined with '/'
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True) for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        ret_data['dtypes'] = {c: classify_type(dtype) for c, dtype in dtypes.items()}
        return ret_data, code
    # ungrouped branch: single 'all' series; with a z-axis the y columns are
    # included in the sort as coordinates
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    code.append("chart_data = chart_data.dropna()")

    # duplicate/limit checks; 3-D (z-axis) charts get the larger point budget
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}), allow_duplicates,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols},
        max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols}
    )
    return ret_data, code