def check_exceptions(
    df,
    allow_duplicates,
    unlimited_data=False,
    data_limit=15000,
    limit_msg=LIMIT_MSG,
    dupes_msg=DUPES_MSG,
):
    """
    Checker function to test the output of any chart aggregations to see if it is one of the following:
        - too large to be rendered by web client
        - contains duplicate data points which can't be rendered (ex: multiple points for a single point on the x-axis
          of a bar chart within the same series)

    :param df: dataframe whose data needs to be checked
    :type df: :class:`pandas:pandas.DataFrame`
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool
    :param unlimited_data: when true the row-count check is skipped entirely
    :type unlimited_data: bool, optional
    :param data_limit: maximum rows allowed for chart rendering (default: 15,000)
    :type data_limit: int, optional
    :param limit_msg: error message template, formatted with `data_limit`
    :type limit_msg: str, optional
    :param dupes_msg: error message template, formatted with the comma-separated column names of `df`
    :type dupes_msg: str, optional
    :raises ChartBuildingError: if any failure condition is met
    """
    # Series.any() reduces in vectorised code instead of iterating the boolean Series element by element
    if not allow_duplicates and df.duplicated().any():
        raise ChartBuildingError(dupes_msg.format(", ".join(df.columns)))
    if not unlimited_data and len(df) > data_limit:
        raise ChartBuildingError(limit_msg.format(data_limit))
def build_base_chart(
    raw_data,
    x,
    y,
    group_col=None,
    group_val=None,
    agg=None,
    allow_duplicates=False,
    return_raw=False,
    unlimited_data=False,
    animate_by=None,
    **kwargs
):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart, when None a 1..N sequence is generated and used instead
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by (concatenated with other lists, so it must be a list)
    :type group_col: list of strings, optional
    :param group_val: forwarded untouched to `retrieve_chart_data`
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when true the prepared dataframe (x-axis column restored to its original name) is returned
                       instead of the formatted chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when true the row limit check is skipped (ungrouped charts only)
    :type unlimited_data: bool, optional
    :param animate_by: column whose values split the output into one entry of `frames` per value
    :type animate_by: str, optional
    :param kwargs: `z` (optional z-axis column) is read here, the whole mapping is also passed to `build_agg_data`
    :return: tuple of (chart data dictionary, list of code snippet strings), or a dataframe when `return_raw` is set
    :raises ChartBuildingError: if the grouping produces more than 30 unique groups
    """
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(
        raw_data, x, y, kwargs.get("z"), group_col, animate_by, group_val=group_val
    )
    x_col = str("x")
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    # y columns only take part in sorting/duplicate detection when a z-axis is present
    sort_cols = y_cols if len(z_cols) else []

    if group_col is not None and len(group_col):
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append(
            "chart_data = chart_data.sort_values(['{cols}'])".format(
                cols="', '".join(sort_cols)
            )
        )
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append(
            "chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})"
        )
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code

        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # format the offending group values so they can be listed back to the user
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            group_listing = group_vals.to_string(index=False)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}"
            ).format(", ".join(group_col), MAX_GROUPS, group_listing)
            raise ChartBuildingError(msg, group_listing)

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        range_cols = [x_col] + y_cols + z_cols
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in range_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in range_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yields (filter expression, formatted series data) for each group found in df
            for group_key, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_key), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier
                        )

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            frame_fmt = find_dtype_formatter(
                dtypes[animate_by], overrides=group_fmt_overrides
            )
            frames = []
            for frame_key, frame in data.sort_values(animate_by).groupby(animate_by):
                frames.append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    )
                )
            ret_data["frames"] = frames
            if frames:
                # the last frame doubles as the top-level series data
                ret_data["data"] = copy.deepcopy(frames[-1]["data"])
            else:
                # dropna() removed every row: avoid IndexError and mirror the non-animated payload
                ret_data["data"] = dict(_load_groups(data))
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code

    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append(
        "chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)
        )
    )
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    data = data[main_group + y_cols + z_cols]
    data = data.rename(columns={x: x_col})
    # keep main_group in step with the rename above
    main_group = [x_col if c == x else c for c in main_group]
    code.append(
        "chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})"
    )
    if agg is not None:
        data, agg_code = build_agg_data(
            data, x_col, y_cols, kwargs, agg, z=z_col, animate_by=animate_by
        )
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)
    range_cols = [x_col] + y_cols + z_cols
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in range_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in range_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(
            find_dtype(data[animate_by]), overrides=group_fmt_overrides
        )
        frames = []
        for frame_key, frame in data.sort_values(animate_by).groupby(animate_by):
            frames.append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                )
            )
        ret_data["frames"] = frames
        if frames:
            # the last frame doubles as the top-level series data
            ret_data["data"] = copy.deepcopy(frames[-1]["data"])
        else:
            # dropna() removed every row: avoid IndexError and mirror the non-animated payload
            ret_data["data"] = {str("all"): data_f.format_lists(data)}
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
def build_base_chart(
    raw_data,
    x,
    y,
    group_col=None,
    group_type=None,
    group_val=None,
    bins_val=None,
    bin_type=None,
    agg=None,
    extended_aggregation=None,
    allow_duplicates=False,
    return_raw=False,
    unlimited_data=False,
    animate_by=None,
    cleaners=None,
    **kwargs
):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart, when None a 1..N sequence is generated and used instead
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by (concatenated with other lists, so it must be a list)
    :type group_col: list of strings, optional
    :param group_type: integer group columns are only binned when this is "bins" (float group columns always are)
    :type group_type: str, optional
    :param group_val: forwarded untouched to `retrieve_chart_data`
    :param bins_val: number of bins used when a group column is binned
    :type bins_val: int, optional
    :param bin_type: "width" bins with :func:`pandas.qcut`, any other value bins with :func:`pandas.cut` using edges
                     interpolated from the sorted, non-null values
    :type bin_type: str, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param extended_aggregation: list of configurations that point to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when true the prepared dataframe (x-axis column restored to its original name) is returned
                       instead of the formatted chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when true the row limit check is skipped (ungrouped charts only)
    :type unlimited_data: bool, optional
    :param animate_by: column whose values split the output into one entry of `frames` per value
    :type animate_by: str, optional
    :param cleaners: cleaner names, comma-joined and handed to `handle_cleaners` for every column classified as "S"
    :type cleaners: list of strings, optional
    :param kwargs: `z` (optional z-axis column) is read here, the whole mapping is also passed to `build_agg_data`
    :return: tuple of (chart data dictionary, list of code snippet strings), or a dataframe when `return_raw` is set
    :raises ChartBuildingError: if the grouping produces more than `MAX_GROUPS` unique groups
    """
    # None defaults avoid sharing one mutable list object between calls
    if extended_aggregation is None:
        extended_aggregation = []
    cleaners = cleaners or []

    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(
        raw_data, x, y, kwargs.get("z"), group_col, animate_by, group_val=group_val
    )
    if len(cleaners):
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners)
                )
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))

    x_col = str("x")
    if x is None:
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # y columns only take part in sorting/duplicate detection when a z-axis is present
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    final_cols = y_cols + z_cols

    if group_col is not None and len(group_col):
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            if classifier == "F" or (classifier == "I" and group_type == "bins"):
                if bin_type == "width":
                    data.loc[:, col] = pd.qcut(
                        data[col], q=bins_val, duplicates="drop"
                    ).astype("str")
                    code.append(
                        (
                            "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                        ).format(col=col, bins=bins_val)
                    )
                else:
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(
                        data[col], bins=equal_freq_bins, duplicates="drop"
                    ).astype("str")
                    # the exported snippet must reference `chart_data`, the only frame name the snippets define
                    code.append(
                        (
                            "bins_data = chart_data['{col}'].dropna()\n"
                            "npt = len(bins_data)\n"
                            "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                            "np.sort(bins_data))\n"
                            "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                            'duplicates="drop")'
                        ).format(col=col, bins=bins_val + 1)
                    )

        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append(
            "chart_data = chart_data.sort_values(['{cols}'])".format(
                cols="', '".join(sort_cols)
            )
        )
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append(
            "chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})"
        )
        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code

        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # format the offending group values so they can be listed back to the user
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            group_listing = group_vals.to_string(index=False)
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}"
            ).format(", ".join(group_col), MAX_GROUPS, group_listing)
            raise ChartBuildingError(msg, group_listing)

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        range_cols = [x_col] + final_cols
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in range_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in range_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yields (display label, formatted series data) per group; the filter expression rides along
            # inside the series data under the "_filter_" key
            for group_key, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_key), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier
                        )

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                group_data = data_f.format_lists(grp)
                group_data["_filter_"] = group_filter
                yield group_label, group_data

        if animate_by is not None:
            frame_fmt = find_dtype_formatter(
                dtypes[animate_by], overrides=group_fmt_overrides
            )
            frames = []
            for frame_key, frame in data.sort_values(animate_by).groupby(animate_by):
                frames.append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    )
                )
            ret_data["frames"] = frames
            if frames:
                # the last frame doubles as the top-level series data
                ret_data["data"] = copy.deepcopy(frames[-1]["data"])
            else:
                # dropna() removed every row: avoid IndexError and mirror the non-animated payload
                ret_data["data"] = dict(_load_groups(data))
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code

    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append(
        "chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)
        )
    )
    check_all_nan(data)
    data = data[main_group + final_cols]
    data = data.rename(columns={x: x_col})
    # keep main_group in step with the rename above
    main_group = [x_col if c == x else c for c in main_group]
    code.append(
        "chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})"
    )

    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")

    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = main_group + y_group_cols
    limit_key = "3d_points" if is_z or animate_by is not None else "scatter_points"
    data_limit = global_state.get_chart_settings()[limit_key]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)
    range_cols = [x_col] + y_group_cols + final_cols
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in range_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in range_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(
            find_dtype(data[animate_by]), overrides=group_fmt_overrides
        )
        frames = []
        for frame_key, frame in data.sort_values(animate_by).groupby(animate_by):
            frames.append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                )
            )
        ret_data["frames"] = frames
        if frames:
            # the last frame doubles as the top-level series data
            ret_data["data"] = copy.deepcopy(frames[-1]["data"])
        else:
            # dropna() removed every row: avoid IndexError and mirror the non-animated payload
            ret_data["data"] = {str("all"): data_f.format_lists(data)}
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
def build_base_chart(
    raw_data,
    x,
    y,
    group_col=None,
    group_val=None,
    agg=None,
    allow_duplicates=False,
    return_raw=False,
    unlimited_data=False,
    **kwargs
):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.  Will return a dictionary of
    dictionaries (one for each series) which contain the data for the x & y axes of the chart as well as the minimum &
    maximum of all the series for the y-axis.  If there is only one series (no group_col specified) the only key in the
    dictionary of series data will be 'all' otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by (concatenated with other lists, so it must be a list)
    :type group_col: list of strings, optional
    :param group_val: forwarded untouched to `retrieve_chart_data`
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when true the prepared dataframe (x-axis column restored to its original name) is returned
                       instead of the formatted chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when true the row limit check is skipped (ungrouped charts only)
    :type unlimited_data: bool, optional
    :param kwargs: `z` (optional z-axis column) is read here, the whole mapping is also passed to `build_agg_data`
    :return: tuple of (chart data dictionary, list of code snippet strings), or a dataframe when `return_raw` is set
    :raises ChartBuildingError: if the grouping produces more than 30 unique groups
    """
    data, code = retrieve_chart_data(
        raw_data, x, y, kwargs.get('z'), group_col, group_val=group_val
    )
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = make_list(z_col)

    if group_col is not None and len(group_col):
        data = data.sort_values(group_col + [x])
        code.append(
            "chart_data = chart_data.sort_values(['{cols}'])".format(
                cols="', '".join(group_col + [x])
            )
        )
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append(
            "chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})"
        )
        if agg is not None and agg != 'raw':
            agg_keys = group_col + [x_col]
            data = data.groupby(agg_keys)
            data = getattr(data, agg)().reset_index()
            # the snippet runs after the rename snippet above, so it has to group on the renamed
            # x-axis column exactly like the real aggregation does
            code.append(
                "chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()".format(
                    cols="', '".join(agg_keys), agg=agg
                )
            )

        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # format the offending group values; they are handed to the error as its second argument
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            msg = (
                'Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. '
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                'are listed below:'
            ).format(', '.join(group_col), MAX_GROUPS)
            raise ChartBuildingError(msg, group_vals.to_string(index=False))

        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        range_cols = [x_col] + y_cols
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in range_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in range_cols
            },
        )

        dtypes = get_dtypes(data)
        group_fmt_overrides = {
            'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
        }
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }
        # one series per group, keyed by the filter expression that selects that group
        for group_key, grp in data.groupby(group_col):
            filter_parts = [
                group_filter_handler(
                    gc, group_fmts[gc](gv, as_string=True), classify_type(dtypes[gc])
                )
                for gv, gc in zip(make_list(group_key), group_col)
            ]
            ret_data['data'][' and '.join(filter_parts)] = data_f.format_lists(grp)
        return ret_data, code

    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append(
        "chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)
        )
    )
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    # NOTE(review): positional relabel - relies on retrieve_chart_data returning columns as x, y..., z; confirm
    chart_cols = [x_col] + y_cols + z_cols
    data.columns = chart_cols
    code.append(
        "chart_data.columns = ['{cols}']".format(cols="', '".join(chart_cols))
    )
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")

    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={'x': x}),
        allow_duplicates or agg == 'raw',
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) else 15000,
    )
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in chart_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in chart_cols
        },
    )
    return ret_data, code