def load_describe(column_series, additional_aggs=None):
    """
    Helper function for grabbing the output from :meth:`pandas:pandas.Series.describe` in a JSON serializable format

    :param column_series: data to describe
    :type column_series: :class:`pandas:pandas.Series`
    :param additional_aggs: names of additional :class:`pandas:pandas.Series` aggregations (e.g. 'sum', 'median',
                            'mode') to append to the standard describe output
    :type additional_aggs: list of str, optional
    :return: tuple of (JSON serializable dictionary of the output from calling
             :meth:`pandas:pandas.Series.describe`, list of python code strings which reproduce the computation)
    """
    # describe() returns a Series; transpose to a one-row frame so the grid formatters can treat stats as columns
    desc = column_series.describe().to_frame().T
    code = [
        "# main statistics",
        "stats = df['{col}'].describe().to_frame().T".format(col=column_series.name),
    ]
    if additional_aggs:
        for agg in additional_aggs:
            if agg == "mode":
                # mode is special-cased: a multi-modal series has no single representative value, so emit NaN
                mode = column_series.mode().values
                desc["mode"] = np.nan if len(mode) > 1 else mode[0]
                code.append(
                    (
                        "# mode\n"
                        "mode = df['{col}'].mode().values\n"
                        "stats['mode'] = np.nan if len(mode) > 1 else mode[0]"
                    ).format(col=column_series.name)
                )
                continue
            # every other aggregation is a no-arg Series method looked up by name (sum, median, var, ...)
            desc[agg] = getattr(column_series, agg)()
            code.append(
                "# {agg}\nstats['{agg}'] = df['{col}'].{agg}()".format(
                    col=column_series.name, agg=agg
                )
            )
    # force ints/floats to be rendered as strings so the front-end keeps full precision
    desc_f_overrides = {
        "I": lambda f, i, c: f.add_int(i, c, as_string=True),
        "F": lambda f, i, c: f.add_float(i, c, precision=4, as_string=True),
    }
    desc_f = grid_formatter(
        grid_columns(desc), nan_display="nan", overrides=desc_f_overrides
    )
    # the frame has exactly one row; format it into a plain dict (None guard for an empty frame)
    desc = desc_f.format_dict(next(desc.itertuples(), None))
    if "count" in desc:
        # pandas always returns 'count' as a float and it adds useless decimal points
        desc["count"] = desc["count"].split(".")[0]
    desc["total_count"] = json_int(len(column_series), as_string=True)
    missing_ct = column_series.isnull().sum()
    desc["missing_pct"] = json_float((missing_ct / len(column_series) * 100).round(2))
    desc["missing_ct"] = json_int(missing_ct, as_string=True)
    return desc, code
def describe(column):
    """
    Flask route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param column: required dash separated string "START-END" stating a range of row indexes to be returned
                   to the screen
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }
    """
    try:
        df = DATA[get_port()]
        series = df[column]
        payload = dict(describe=load_describe(series), success=True)
        unique_values = series.unique()
        # only add a 'unique' stat when describe() didn't already produce one (numeric columns omit it)
        if 'unique' not in payload['describe']:
            payload['describe']['unique'] = json_int(len(unique_values), as_string=True)
        # ship the raw unique values only when the cardinality is small enough to display
        if len(unique_values) <= 100:
            formatter = find_dtype_formatter(get_dtypes(df)[column])
            payload['uniques'] = [formatter(value, nan_display='N/A') for value in unique_values]
        return jsonify(payload)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: required dash separated string "START-END" stating a range of row indexes to be returned
                   to the screen
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }
    """
    try:
        data = DATA[data_id][[column]]
        additional_aggs = None
        # look up the stored dtype for this column so we know whether numeric-only aggs apply
        dtype = next((dtype_info['dtype']
                      for dtype_info in DTYPES[data_id]
                      if dtype_info['name'] == column), None)
        if classify_type(dtype) in ['I', 'F']:
            additional_aggs = [
                'sum', 'median', 'mode', 'var', 'sem', 'skew', 'kurt'
            ]
        # BUG FIX: load_describe returns a (stats_dict, code_lines) tuple; previously the whole tuple was
        # stored as return_data['describe'], so the 'unique' assignment below raised a TypeError on a tuple.
        desc, _desc_code = load_describe(data[column], additional_aggs=additional_aggs)
        return_data = dict(describe=desc, success=True)
        uniq_vals = data[column].unique()
        # only add a 'unique' stat when describe() didn't already produce one (numeric columns omit it)
        if 'unique' not in return_data['describe']:
            return_data['describe']['unique'] = json_int(len(uniq_vals),
                                                         as_string=True)
        uniq_f = find_dtype_formatter(get_dtypes(data)[column])
        if len(uniq_vals) <= 100:
            return_data['uniques'] = dict(data=[uniq_f(u) for u in uniq_vals],
                                          top=False)
        else:  # get top 100 most common values
            uniq_vals = data[column].value_counts().sort_values(
                ascending=False).head(100).index.values
            return_data['uniques'] = dict(data=[uniq_f(u) for u in uniq_vals],
                                          top=True)
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
def build_base_chart(raw_data, x, y, group_col=None, group_val=None, agg=None,
                     allow_duplicates=False, return_raw=False,
                     unlimited_data=False, animate_by=None, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group: comma-separated string of columns to group chart data by
    :type group: str, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: if True, return the prepared dataframe instead of formatted series data
    :type return_raw: bool, optional
    :param animate_by: column used to split data into animation frames
    :type animate_by: str, optional
    :return: tuple of (dict of chart series data, list of python code strings) or a
             :class:`pandas:pandas.DataFrame` when return_raw is True
    """
    # integers formatted without thousands separators so group labels stay query-friendly
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get("z"),
                                     group_col, animate_by,
                                     group_val=group_val)
    x_col = str("x")
    if x is None:
        # no x-axis specified: synthesize one from the row position
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    # 3D charts (z present) also sort on y so heatmap-style output is stable
    sort_cols = y_cols if len(z_cols) else []
    if group_col is not None and len(group_col):
        # ---- grouped-series branch: one output series per group value ----
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # too many series to render legibly: build a readable listing of the groups and bail out
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
        )
        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yields one (filter-string, formatted-series) pair per group value
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            # one frame per animate_by value; the last frame doubles as the initial chart data
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # ---- ungrouped branch: a single 'all' series ----
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    data = data[main_group + y_cols + z_cols]
    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")
    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col,
                                        animate_by=animate_by)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values make a line/scatter chart ambiguous unless explicitly allowed or pre-aggregated
    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_cols + z_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_cols + z_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
# NOTE(review): extended_aggregation=[] and cleaners=[] are mutable default arguments; neither is mutated
# here (cleaners is rebound via `cleaners or []`), but converting them to None-defaults would be safer.
def build_base_chart(raw_data, x, y, group_col=None, group_type=None,
                     group_val=None, bins_val=None, bin_type=None, agg=None,
                     extended_aggregation=[], allow_duplicates=False,
                     return_raw=False, unlimited_data=False, animate_by=None,
                     cleaners=[], **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group: comma-separated string of columns to group chart data by
    :type group: str, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param extended_aggregation: list of configurations that point to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param cleaners: names of string-cleaning functions to apply to string columns before charting
    :type cleaners: list, optional
    :return: tuple of (dict of chart series data, list of python code strings) or a
             :class:`pandas:pandas.DataFrame` when return_raw is True
    """
    # integers formatted without thousands separators so group labels stay query-friendly
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get("z"),
                                     group_col, animate_by,
                                     group_val=group_val)
    cleaners = cleaners or []
    if len(cleaners):
        # apply string cleaners to every string-classified column
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners))
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))
    x_col = str("x")
    if x is None:
        # no x-axis specified: synthesize one from the row position
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # 3D charts (z present) also sort/dupe-check on y so heatmap-style output is stable
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    final_cols = y_cols + z_cols
    if group_col is not None and len(group_col):
        # ---- grouped-series branch: one output series per group value ----
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            # float columns (and ints when explicitly requested) are bucketed into bins before grouping
            if classifier == "F" or (classifier == "I"
                                     and group_type == "bins"):
                if bin_type == "width":
                    data.loc[:, col] = pd.qcut(
                        data[col], q=bins_val, duplicates="drop").astype("str")
                    code.append((
                        "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                    ).format(col=col, bins=bins_val))
                else:
                    # equal-frequency bins: interpolate bin edges over the sorted values
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(
                        data[col], bins=equal_freq_bins,
                        duplicates="drop").astype("str")
                    code.append((
                        "bins_data = data['{col}'].dropna()\n"
                        "npt = len(bins_data)\n"
                        "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                        "np.sort(bins_data))\n"
                        "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                        'duplicates="drop")').format(col=col,
                                                     bins=bins_val + 1))
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code
        # NOTE(review): MAX_GROUPS is not defined in this function — presumably a module-level constant; verify.
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # too many series to render legibly: build a readable listing of the groups and bail out
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c],
                                        overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + final_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + final_cols
            },
        )
        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yields one (group-label, formatted-series) pair per group value; the pandas
            # query string for the group is embedded in the series under '_filter_'
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    # group_filter_handler is expected to yield (filter, label) pairs here
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                # shadows the outer 'data' DataFrame with the formatted dict for this group
                data = data_f.format_lists(grp)
                data["_filter_"] = group_filter
                yield group_label, data

        if animate_by is not None:
            # one frame per animate_by value; the last frame doubles as the initial chart data
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # ---- ungrouped branch: a single 'all' series ----
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    data = data[main_group + final_cols]
    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")
    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")
    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values make a line/scatter chart ambiguous unless explicitly allowed or pre-aggregated
    dupe_cols = main_group + y_group_cols
    data_limit = global_state.get_chart_settings(
    )["3d_points" if is_z or animate_by is not None else "scatter_points"]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
def build_chart(raw_data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group: comma-separated string of columns to group chart data by
    :type group: str, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :return: tuple of (dict of chart series data, list of python code strings)
    """
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get('z'), group_col)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = []
    if z_col is not None:
        z_cols = [z_col]
    if group_col is not None and len(group_col):
        # ---- grouped-series branch: one output series per group value ----
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})")
        if agg is not None:
            # aggregation is a no-arg DataFrameGroupBy method looked up by name (mean, sum, ...)
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append("chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()".format(
                cols="', '".join(group_col + [x]),
                agg=agg
            ))
        # too many series would make the chart unreadable
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)
        data = data.dropna()
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
            max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        )
        dtypes = get_dtypes(data)
        # integers formatted without thousands separators so group labels stay query-friendly
        group_fmt_overrides = {'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')}
        group_fmts = {c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # series key is the slash-joined, formatted group values
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True)
                for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        ret_data['dtypes'] = {c: classify_type(dtype) for c, dtype in dtypes.items()}
        return ret_data, code
    # ---- ungrouped branch: a single 'all' series ----
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values make a line/scatter chart ambiguous unless explicitly allowed
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}), allow_duplicates,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols},
        max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols}
    )
    return ret_data, code
def find_coverage():
    """
    Flask route which returns coverage information(counts) for a column grouped by other column(s)

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
    :param filters(deprecated): JSON string from flaks.request.args['filters'] with filtering information from group
           drilldown [
        {name: col1, prevFreq: Y, freq: Q, date: YYYY-MM-DD},
        ...
        {name: col1, prevFreq: D, freq: W, date: YYYY-MM-DD},
    ]
    :param group: JSON string from flask.request.args['group'] containing grouping logic in this structure [
        {name: col1} or {name: date_col1, freq: [D,W,M,Q,Y]}
    ]
    :returns: JSON {
        data: {
            [col]: [count1,count2,...,countN],
            labels: [{group_col1: gc1_v1, group_col2: gc2_v1},...,{group_col1: gc1_vN, group_col2: gc2_vN}],
        success: True
    } or {error: 'Exception message', traceback: 'Exception stacktrace', success: False}
    """
    def filter_data(df, req, groups, query=None):
        # applies the (deprecated) drilldown 'filters' arg: narrows df to the date slice of the last filter
        # and swaps that column's grouping frequency; returns (filtered df, updated groups, range query string)
        filters = get_str_arg(req, 'filters')
        if not filters:
            # 'index == index' is a no-op query used when no user query is supplied
            return df.query(query or 'index == index'), groups, ''
        filters = json.loads(filters)
        col, prev_freq, freq, end = map(filters[-1].get, ['name', 'prevFreq', 'freq', 'date'])
        # slice start = beginning of the period (at the previous frequency) containing the end date
        start = DATE_RANGES[prev_freq](pd.Timestamp(end)).strftime('%Y%m%d')
        range_query = "{col} >= '{start}' and {col} <= '{end}'".format(
            col=col, start=start, end=end)
        logger.info('filtered coverage data to slice: {}'.format(range_query))
        updated_groups = [
            dict(name=col, freq=freq) if g['name'] == col else g
            for g in groups
        ]
        return df.query(
            query or 'index == index').query(range_query), updated_groups, range_query

    try:
        col = get_str_arg(request, 'col')
        groups = get_str_arg(request, 'group')
        if groups:
            groups = json.loads(groups)
        data = DATA[get_port()]
        data, groups, query = filter_data(data, request, groups, query=get_str_arg(request, 'query'))
        # build one grouper per config entry: a resampled period index for date groups, raw column otherwise
        grouper = []
        for g_cfg in groups:
            if 'freq' in g_cfg:
                freq_grp = data.set_index([g_cfg['name']]).index.to_period(
                    g_cfg['freq']).to_timestamp(how='end')
                freq_grp.name = g_cfg['name']
                grouper.append(freq_grp)
            else:
                grouper.append(data[g_cfg['name']])
        data_groups = data.groupby(grouper)
        group_data = data_groups[col].count()
        if len(groups) > 1:
            # unstack all but the largest grouping level (by cardinality) so it stays as the row index
            unstack_order = enumerate(
                zip(group_data.index.names, group_data.index.levels))
            unstack_order = sorted([(uo[0], uo[1][0], len(uo[1][1]))
                                    for uo in unstack_order],
                                   key=lambda k: k[2])
            for i, n, l in unstack_order[:-1]:
                group_data = group_data.unstack(i)
            group_data = group_data.fillna(0)
            if len(unstack_order[:-1]) > 1:
                # NOTE(review): MultiIndex.labels is deprecated in newer pandas (renamed to .codes) — verify
                # against the pandas version this is pinned to.
                group_data.columns = [
                    ', '.join([
                        str(group_data.columns.levels[c2[0]][c2[1]])
                        for c2 in enumerate(c)
                    ]) for c in zip(*group_data.columns.labels)
                ]
            else:
                group_data.columns = map(str, group_data.columns.values)
        if len(group_data) > 15000:
            # too many groups would hang the front-end chart renderer
            return jsonify(
                dict(error=(
                    'Your grouping created {} groups, chart will not render. '
                    'Try making date columns a higher frequency (W, M, Q, Y)'
                ).format(len(data_groups))))
        if len(groups) == 1:
            data = {col: [json_int(v) for v in group_data.values]}
        else:
            data = dict([(c, [json_int(v) for v in group_data[c].values])
                         for c in group_data.columns])
        labels = pd.DataFrame(group_data.index.values, columns=group_data.index.names)
        labels_f_overrides = {
            'D': lambda f, i, c: f.add_date(i, c, fmt='%Y-%m-%d'),
        }
        labels_f = grid_formatter(grid_columns(labels), overrides=labels_f_overrides)
        labels = labels_f.format_dicts(labels.itertuples())
        return jsonify(data=data, labels=labels, success=True)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
def build_base_chart(raw_data, x, y, group_col=None, group_val=None, agg=None,
                     allow_duplicates=False, return_raw=False,
                     unlimited_data=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group: comma-separated string of columns to group chart data by
    :type group: str, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: if True, return the prepared dataframe instead of formatted series data
    :type return_raw: bool, optional
    :return: tuple of (dict of chart series data, list of python code strings) or a
             :class:`pandas:pandas.DataFrame` when return_raw is True
    """
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get('z'),
                                     group_col, group_val=group_val)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = make_list(z_col)
    if group_col is not None and len(group_col):
        # ---- grouped-series branch: one output series per group value ----
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None and agg != 'raw':
            # aggregation is a no-arg DataFrameGroupBy method looked up by name (mean, sum, ...)
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append(
                "chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()"
                .format(cols="', '".join(group_col + [x]), agg=agg))
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # too many series to render legibly: build a readable listing of the groups and bail out
            dtypes = get_dtypes(group_vals)
            group_fmt_overrides = {
                'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
            }
            group_fmts = {
                c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # BUG FIX: the template previously ended at 'are listed below:' with no third placeholder, so the
            # formatted group listing passed to .format() was silently dropped from the message (extra args to
            # str.format are ignored).  Restored '\n\n{}' to match the other build_base_chart versions.
            msg = (
                'Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. '
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                'are listed below:\n\n{}').format(', '.join(group_col), MAX_GROUPS,
                                                  group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
        )
        dtypes = get_dtypes(data)
        # integers formatted without thousands separators so group labels stay query-friendly
        group_fmt_overrides = {
            'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
        }
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }
        for group_val, grp in data.groupby(group_col):

            def _group_filter():
                # one pandas-query clause per grouping column for this group's values
                for gv, gc in zip(make_list(group_val), group_col):
                    classifier = classify_type(dtypes[gc])
                    yield group_filter_handler(
                        gc, group_fmts[gc](gv, as_string=True), classifier)

            group_filter = ' and '.join(list(_group_filter()))
            ret_data['data'][group_filter] = data_f.format_lists(grp)
        return ret_data, code
    # ---- ungrouped branch: a single 'all' series ----
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(
        cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values make a line/scatter chart ambiguous unless explicitly allowed or pre-aggregated
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}),
                     allow_duplicates or agg == 'raw',
                     unlimited_data=unlimited_data,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(data={str('all'): data_f.format_lists(data)},
                    min={
                        col: fmt(data[col].min(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    },
                    max={
                        col: fmt(data[col].max(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    })
    return ret_data, code