def describe(column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column
    data using :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON.

    :param column: name of the column within the current process's data to describe
    :return: JSON {
        describe: object representing output from
                  :meth:`pandas:pandas.Series.describe`,
        uniques: array of formatted unique values when data has <= 100 unique values,
        success: True/False
    }
    """
    try:
        df = DATA[get_port()]
        payload = {'describe': load_describe(df[column]), 'success': True}
        unique_values = df[column].unique()
        # describe() omits a "unique" count for numeric columns; fill it in ourselves
        if 'unique' not in payload['describe']:
            payload['describe']['unique'] = json_int(len(unique_values),
                                                     as_string=True)
        # only ship the raw unique values when the list is small enough to render
        if len(unique_values) <= 100:
            formatter = find_dtype_formatter(get_dtypes(df)[column])
            payload['uniques'] = [
                formatter(value, nan_display='N/A') for value in unique_values
            ]
        return jsonify(payload)
    except BaseException as exc:
        return jsonify(
            dict(error=str(exc), traceback=str(traceback.format_exc())))
def build(self, parent):
    """
    Build the JSON payload for a histogram analysis of ``parent.selected_col``.

    :param parent: analysis container exposing the dataframe (``parent.data``),
        the column under analysis (``parent.selected_col``), its classification
        (``parent.classifier``) and the data identifier (``parent.data_id``)
    :return: tuple of (chart data dict, code snippets from ``self._build_code``)
    """
    if parent.classifier == "D":
        # date columns can't be binned directly; convert to epoch timestamps first
        parent.data.loc[:, parent.selected_col] = apply(
            parent.data[parent.selected_col], json_timestamp)
    kde_code = []
    if self.target is None:
        # single-series histogram, with a KDE overlay when one can be computed
        return_data, hist_labels = self.build_histogram_data(
            parent.data[parent.selected_col])
        kde, kde_code = build_kde(parent.data[parent.selected_col],
                                  hist_labels, parent.selected_col)
        if kde is not None:
            return_data["kde"] = kde
    else:
        # one histogram series per unique value of the "target" column
        # NOTE(review): labels here are plain bin indexes (0..bins-1) rather than
        # bin-range strings -- confirm this is what the front-end expects
        return_data = {"targets": [], "labels": list(range(self.bins))}
        target_dtype = find_dtype(parent.data[self.target])
        target_formatter = find_dtype_formatter(target_dtype)
        for target, target_data in parent.data[[
                self.target, parent.selected_col
        ]].groupby(self.target):
            target_data, _ = self.build_histogram_data(
                target_data[parent.selected_col])
            target_data["target"] = target_formatter(target, as_string=True)
            return_data["targets"].append(target_data)
    desc, desc_code = load_describe(parent.data[parent.selected_col])
    dtype_info = global_state.get_dtype_info(parent.data_id, parent.selected_col)
    # prefer pre-computed skew/kurtosis stored in global state when available
    for p in ["skew", "kurt"]:
        if p in dtype_info:
            desc[p] = dtype_info[p]
    return_data["desc"] = desc
    return return_data, self._build_code(parent, kde_code, desc_code)
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column
    data using :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON.

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: name of the column within that data to describe
    :return: JSON {
        describe: object representing output from
                  :meth:`pandas:pandas.Series.describe`,
        uniques: {data: up to 100 formatted unique values,
                  top: True when truncated to the 100 most common},
        success: True/False
    }
    """
    try:
        df = DATA[data_id][[column]]
        # look up the stored dtype for this column so we know whether the
        # richer numeric aggregations apply
        dtype = next((info['dtype']
                      for info in DTYPES[data_id] if info['name'] == column),
                     None)
        extra_aggs = None
        if classify_type(dtype) in ['I', 'F']:  # ints & floats get numeric stats
            extra_aggs = ['sum', 'median', 'mode', 'var', 'sem', 'skew', 'kurt']
        return_data = dict(
            describe=load_describe(df[column], additional_aggs=extra_aggs),
            success=True)
        unique_values = df[column].unique()
        # describe() omits a "unique" count for numeric columns; fill it in ourselves
        if 'unique' not in return_data['describe']:
            return_data['describe']['unique'] = json_int(len(unique_values),
                                                         as_string=True)
        formatter = find_dtype_formatter(get_dtypes(df)[column])
        truncated = len(unique_values) > 100
        if truncated:
            # too many uniques to ship them all; keep the 100 most common values
            unique_values = df[column].value_counts().sort_values(
                ascending=False).head(100).index.values
        return_data['uniques'] = dict(
            data=[formatter(value) for value in unique_values], top=truncated)
        return jsonify(return_data)
    except BaseException as exc:
        return jsonify(
            dict(error=str(exc), traceback=str(traceback.format_exc())))
def build(self, parent):
    """
    Build the JSON payload for a histogram analysis of ``parent.selected_col``.

    When ``self.target`` is set, the analyzed column is cut once into
    ``self.bins`` bins so every target series shares the same bin labels, and a
    series of per-bin counts is produced for each unique target value.
    Otherwise a single histogram (with an optional KDE overlay) is returned.

    :param parent: analysis container exposing the dataframe (``parent.data``),
        the column under analysis (``parent.selected_col``), its classification
        (``parent.classifier``) and the data identifier (``parent.data_id``)
    :return: tuple of (chart data dict, code snippets from ``self._build_code``)
    """
    if parent.classifier == "D":
        # date columns can't be binned directly; convert to epoch timestamps first
        parent.data.loc[:, parent.selected_col] = apply(
            parent.data[parent.selected_col], json_timestamp
        )
    kde_code = []
    if self.target is None:
        return_data, hist_labels = self.build_histogram_data(
            parent.data[parent.selected_col]
        )
        kde, kde_code = build_kde(
            parent.data[parent.selected_col], hist_labels, parent.selected_col
        )
        if kde is not None:
            return_data["kde"] = kde
    else:
        # bin the analyzed column once so all target series share identical bins
        bin_vals = pd.cut(parent.data[parent.selected_col], bins=self.bins)
        labels = ["{}".format(c) for c in bin_vals.dtype.categories]
        parent.data.loc[:, "bin"] = bin_vals.astype("str")
        return_data = {"targets": [], "labels": labels}
        target_dtype = find_dtype(parent.data[self.target])
        target_formatter = find_dtype_formatter(target_dtype)
        for target, target_data in parent.data[[self.target, "bin"]].groupby(
            self.target
        ):
            target_counts = target_data["bin"].value_counts()
            # reindex so every series reports a count (possibly 0) for each bin
            target_counts = [
                int(tc) for tc in target_counts.reindex(labels, fill_value=0).values
            ]
            return_data["targets"].append(
                dict(
                    target=target_formatter(target, as_string=True),
                    data=target_counts,
                )
            )
    desc, desc_code = load_describe(parent.data[parent.selected_col])
    dtype_info = global_state.get_dtype_info(parent.data_id, parent.selected_col)
    # prefer pre-computed skew/kurtosis stored in global state when available
    for p in ["skew", "kurt"]:
        if p in dtype_info:
            desc[p] = dtype_info[p]
    return_data["desc"] = desc
    return return_data, self._build_code(parent, kde_code, desc_code)
def build_base_chart(raw_data, x, y, group_col=None, group_val=None, agg=None,
                     allow_duplicates=False, return_raw=False,
                     unlimited_data=False, animate_by=None, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.
    Will return a dictionary of dictionaries (one for each series) which contain
    the data for the x & y axes of the chart as well as the minimum & maximum of
    all the series for the y-axis. If there is only one series (no group_col
    specified) the only key in the dictionary of series data will be 'all'
    otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart (``None`` for a synthetic
        sequential x-axis)
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_val: specific group values to filter the data down to
    :type group_val: list of dicts, optional
    :param agg: points to a specific function that can be applied to
        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are:
        count, first, last mean, median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for
        scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True, return the prepared dataframe itself instead
        of the serialized chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when True, skip the record-count limit check
    :type unlimited_data: bool, optional
    :param animate_by: column whose sorted unique values become animation frames
    :type animate_by: str, optional
    :return: tuple of (serialized chart data dict, list of equivalent pandas
        code snippets), or a dataframe when ``return_raw`` is True
    """
    # integer group values are rendered without thousands separators
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get("z"), group_col,
                                     animate_by, group_val=group_val)
    x_col = str("x")
    if x is None:
        # no x-axis specified: synthesize one from the row position
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    # for 3-dimensional (z-axis) charts the y values participate in sorting/grouping
    sort_cols = y_cols if len(z_cols) else []
    if group_col is not None and len(group_col):
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # too many distinct groups to chart legibly: build a readable
            # listing of the available group values and abort
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # NOTE(review): "from then" looks like a typo for "from the" in this
            # user-facing message
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
        )
        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yield one (filter-string, serialized data) pair per group; the
            # filter string doubles as the series key on the front-end
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            # one frame per sorted unique value of animate_by; "data" is seeded
            # with the final frame so the chart starts fully drawn
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # --- ungrouped path: a single "all" series ---
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    data = data[main_group + y_cols + z_cols]
    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg,
                                        z=z_col, animate_by=animate_by)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values are only an error when not aggregating them away
    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_cols + z_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_cols + z_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
def build_base_chart(raw_data, x, y, group_col=None, group_type=None,
                     group_val=None, bins_val=None, bin_type=None, agg=None,
                     extended_aggregation=None, allow_duplicates=False,
                     return_raw=False, unlimited_data=False, animate_by=None,
                     cleaners=None, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.
    Will return a dictionary of dictionaries (one for each series) which contain
    the data for the x & y axes of the chart as well as the minimum & maximum of
    all the series for the y-axis. If there is only one series (no group_col
    specified) the only key in the dictionary of series data will be 'all'
    otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart (``None`` for a synthetic
        sequential x-axis)
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_type: "bins" to bin integer group columns rather than treating
        each value as its own group
    :type group_type: str, optional
    :param group_val: specific group values to filter the data down to
    :type group_val: list of dicts, optional
    :param bins_val: number of bins to split float/binned-integer groups into
    :type bins_val: int, optional
    :param bin_type: binning strategy for group columns ("width" or frequency)
    :type bin_type: str, optional
    :param agg: points to a specific function that can be applied to
        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are:
        count, first, last mean, median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param extended_aggregation: list of configurations that point to a specific
        function that can be applied to
        :func: pandas.core.groupby.DataFrameGroupBy.  Defaults to an empty list.
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for
        scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True, return the prepared dataframe itself instead
        of the serialized chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when True, skip the record-count limit check
    :type unlimited_data: bool, optional
    :param animate_by: column whose sorted unique values become animation frames
    :type animate_by: str, optional
    :param cleaners: string-cleaner names applied to every string column
    :type cleaners: list, optional
    :return: tuple of (serialized chart data dict, list of equivalent pandas
        code snippets), or a dataframe when ``return_raw`` is True
    """
    # normalize list parameters up front; defaults are None rather than [] to
    # avoid the shared-mutable-default-argument pitfall
    extended_aggregation = extended_aggregation or []
    cleaners = cleaners or []
    # integer group values are rendered without thousands separators
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get("z"), group_col,
                                     animate_by, group_val=group_val)
    if len(cleaners):
        # apply the requested string cleaners to every string column
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners))
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))
    x_col = str("x")
    if x is None:
        # no x-axis specified: synthesize one from the row position
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # for 3-dimensional (z-axis) charts the y values participate in sorting/grouping
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    final_cols = y_cols + z_cols
    if group_col is not None and len(group_col):
        # float (and optionally integer) group columns are binned so each bin
        # becomes one group
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            if classifier == "F" or (classifier == "I" and group_type == "bins"):
                if bin_type == "width":
                    # NOTE(review): pd.qcut is quantile (equal-frequency)
                    # binning while this branch is labeled "width", and the
                    # else-branch builds equal-frequency edges for pd.cut --
                    # the two strategies look swapped; confirm intended behavior
                    data.loc[:, col] = pd.qcut(data[col], q=bins_val,
                                               duplicates="drop").astype("str")
                    code.append((
                        "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                    ).format(col=col, bins=bins_val))
                else:
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(data[col], bins=equal_freq_bins,
                                              duplicates="drop").astype("str")
                    code.append((
                        "bins_data = data['{col}'].dropna()\n"
                        "npt = len(bins_data)\n"
                        "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                        "np.sort(bins_data))\n"
                        "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                        'duplicates="drop")').format(col=col, bins=bins_val + 1))
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # too many distinct groups to chart legibly: build a readable
            # listing of the available group values and abort
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # typo fix: "from then" -> "from the" in this user-facing message
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from the "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + final_cols
            },
        )
        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yield one (label, serialized data) pair per group; each payload
            # also carries the pandas query string under "_filter_"
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                data = data_f.format_lists(grp)
                data["_filter_"] = group_filter
                yield group_label, data

        if animate_by is not None:
            # one frame per sorted unique value of animate_by; "data" is seeded
            # with the final frame so the chart starts fully drawn
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # --- ungrouped path: a single "all" series ---
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    data = data[main_group + final_cols]
    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")
    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")
    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values are only an error when not aggregating them away
    dupe_cols = main_group + y_group_cols
    data_limit = global_state.get_chart_settings(
    )["3d_points" if is_z or animate_by is not None else "scatter_points"]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
    )
    if animate_by is not None:
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
def build_chart(raw_data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.
    Will return a dictionary of dictionaries (one for each series) which contain
    the data for the x & y axes of the chart as well as the minimum & maximum of
    all the series for the y-axis. If there is only one series (no group_col
    specified) the only key in the dictionary of series data will be 'all'
    otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param agg: points to a specific function that can be applied to
        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are:
        count, first, last mean, median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for
        scatter plots)
    :type allow_duplicates: bool, optional
    :return: tuple of (serialized chart data dict, list of equivalent pandas
        code snippets)
    """
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get('z'), group_col)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = []
    if z_col is not None:
        z_cols = [z_col]
    if group_col is not None and len(group_col):
        # --- grouped path: one series per unique combination of group values ---
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" + x_col + "'})")
        if agg is not None:
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append("chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()".format(
                cols="', '".join(group_col + [x]), agg=agg
            ))
        # too many distinct groups makes the chart unreadable
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)
        data = data.dropna()
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
            max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        )
        dtypes = get_dtypes(data)
        # integer group values are rendered without thousands separators
        group_fmt_overrides = {'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')}
        group_fmts = {c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # series key is the formatted group values joined with "/"
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True)
                for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        ret_data['dtypes'] = {c: classify_type(dtype) for c, dtype in dtypes.items()}
        return ret_data, code
    # --- ungrouped path: a single "all" series ---
    # for 3-dimensional (z-axis) charts the y values participate in sorting
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values (per y for 3-D charts) would break the rendered chart
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}), allow_duplicates,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols},
        max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols + z_cols}
    )
    return ret_data, code
def build_chart(data, x, y, group_col=None, agg=None, allow_duplicates=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.
    Will return a dictionary of dictionaries (one for each series) which contain
    the data for the x & y axes of the chart as well as the minimum & maximum of
    all the series for the y-axis. If there is only one series (no group_col
    specified) the only key in the dictionary of series data will be 'all'
    otherwise the keys will be the values of the groups.

    :param data: dataframe to be used for chart
    :type data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param agg: points to a specific function that can be applied to
        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are:
        count, first, last mean, median, min, max, std, var, mad, prod, sum
        (plus 'rolling' in the ungrouped path, driven by the 'rolling_win' &
        'rolling_comp' kwargs)
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for
        scatter plots)
    :type allow_duplicates: bool, optional
    :return: dict of serialized chart data
    """

    def build_formatters(df):
        # two formatters: one serializing data values (NaN -> None) and one for
        # axis min/max ranges (floats rounded to 2 decimal places)
        cols = grid_columns(df)
        data_f = grid_formatter(cols, nan_display=None)
        overrides = {'F': lambda f, i, c: f.add_float(i, c, precision=2)}
        range_f = grid_formatter(cols, overrides=overrides, nan_display=None)
        return data_f, range_f

    def check_all_nan(df, cols):
        # fail fast when a requested axis contains no usable data at all
        for col in cols:
            if df[col].isnull().all():
                raise Exception('All data for column "{}" is NaN!'.format(col))

    x_col = str('x')
    y_cols = make_list(y)
    if group_col is not None:
        # --- grouped path: one series per unique combination of group values ---
        data = data[group_col + [x] + y_cols].sort_values(group_col + [x])
        check_all_nan(data, [x] + y_cols)
        y_cols = [str(y_col) for y_col in y_cols]
        data.columns = group_col + [x_col] + y_cols
        if agg is not None:
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
        # too many distinct groups makes the chart unreadable
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add additional filter'
                ' or else chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)
        data_f, range_f = build_formatters(data[[x_col] + y_cols])
        ret_data = dict(
            data={},
            min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
            max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        )
        dtypes = get_dtypes(data)
        group_fmts = {c: find_dtype_formatter(dtypes[c]) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # series key is the formatted group values joined with "/"
            group_val = '/'.join([
                group_fmts[gc](gv, as_string=True)
                for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = data_f.format_lists(grp)
        return ret_data
    # --- ungrouped path: a single "all" series ---
    data = data[[x] + y_cols].sort_values(x)
    check_all_nan(data, [x] + y_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols
    if agg is not None:
        if agg == 'rolling':
            # rolling-window aggregation driven by kwargs
            window, comp = map(kwargs.get, ['rolling_win', 'rolling_comp'])
            data = data.set_index(x_col).rolling(window=window)
            data = pd.DataFrame({c: getattr(data[c], comp)() for c in y_cols})
            data = data.reset_index()
        else:
            data = data.groupby(x_col)
            data = getattr(data[y_cols], agg)().reset_index()
    if not allow_duplicates and any(data[x_col].duplicated()):
        raise Exception('{} contains duplicates, please specify group or additional filtering'.format(x))
    if len(data) > 15000:
        raise Exception('Dataset exceeds 15,000 records, cannot render. Please apply filter...')
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        data={str('all'): data_f.format_lists(data)},
        min={col: fmt(data[col].min(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
        max={col: fmt(data[col].max(), None) for _, col, fmt in range_f.fmts if col in [x_col] + y_cols},
    )
    return ret_data
def build_base_chart(raw_data, x, y, group_col=None, group_val=None, agg=None,
                     allow_duplicates=False, return_raw=False,
                     unlimited_data=False, **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.
    Will return a dictionary of dictionaries (one for each series) which contain
    the data for the x & y axes of the chart as well as the minimum & maximum of
    all the series for the y-axis. If there is only one series (no group_col
    specified) the only key in the dictionary of series data will be 'all'
    otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_val: specific group values to filter the data down to
    :type group_val: list of dicts, optional
    :param agg: points to a specific function that can be applied to
        :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are:
        count, first, last mean, median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for
        scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: when True, return the prepared dataframe itself instead
        of the serialized chart payload
    :type return_raw: bool, optional
    :param unlimited_data: when True, skip the record-count limit check
    :type unlimited_data: bool, optional
    :return: tuple of (serialized chart data dict, list of equivalent pandas
        code snippets), or a dataframe when ``return_raw`` is True
    """
    data, code = retrieve_chart_data(raw_data, x, y, kwargs.get('z'), group_col,
                                     group_val=group_val)
    x_col = str('x')
    y_cols = make_list(y)
    z_col = kwargs.get('z')
    z_cols = make_list(z_col)
    if group_col is not None and len(group_col):
        # --- grouped path: one series per unique combination of group values ---
        data = data.sort_values(group_col + [x])
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(group_col + [x])))
        check_all_nan(data, [x] + y_cols)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None and agg != 'raw':
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
            code.append(
                "chart_data = chart_data.groupby(['{cols}']).{agg}().reset_index()"
                .format(cols="', '".join(group_col + [x]), agg=agg))
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            # too many distinct groups to chart legibly: build a readable
            # listing of the available group values and abort
            dtypes = get_dtypes(group_vals)
            group_fmt_overrides = {
                'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
            }
            group_fmts = {
                c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # BUG FIX: the format string previously had only two placeholders
            # while .format() received three arguments, so the group listing
            # promised by "are listed below:" was silently dropped (extra
            # positional args are ignored by str.format); also fixed the
            # "from then" -> "from the" typo
            msg = (
                'Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. '
                'You can choose specific groups to display from the "Group(s)" dropdown above. The available group(s) '
                'are listed below:\n\n{}').format(
                    ', '.join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts if col in [x_col] + y_cols
            },
        )
        dtypes = get_dtypes(data)
        # integer group values are rendered without thousands separators
        group_fmt_overrides = {
            'I': lambda v, as_string: json_int(v, as_string=as_string, fmt='{}')
        }
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }
        for group_val, grp in data.groupby(group_col):

            def _group_filter():
                # build one "column == value" style clause per group column
                for gv, gc in zip(make_list(group_val), group_col):
                    classifier = classify_type(dtypes[gc])
                    yield group_filter_handler(
                        gc, group_fmts[gc](gv, as_string=True), classifier)

            # the joined filter string doubles as the series key on the front-end
            group_filter = ' and '.join(list(_group_filter()))
            ret_data['data'][group_filter] = data_f.format_lists(grp)
        return ret_data, code
    # --- ungrouped path: a single "all" series ---
    # for 3-dimensional (z-axis) charts the y values participate in sorting
    sort_cols = [x] + (y_cols if len(z_cols) else [])
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data, [x] + y_cols + z_cols)
    y_cols = [str(y_col) for y_col in y_cols]
    data.columns = [x_col] + y_cols + z_cols
    code.append("chart_data.columns = ['{cols}']".format(
        cols="', '".join([x_col] + y_cols + z_cols)))
    if agg is not None:
        data, agg_code = build_agg_data(data, x_col, y_cols, kwargs, agg, z=z_col)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate x-values are only an error when not aggregating them away
    dupe_cols = [x_col] + (y_cols if len(z_cols) else [])
    check_exceptions(data[dupe_cols].rename(columns={'x': x}),
                     allow_duplicates or agg == 'raw',
                     unlimited_data=unlimited_data,
                     data_limit=40000 if len(z_cols) else 15000)
    data_f, range_f = build_formatters(data)
    ret_data = dict(data={str('all'): data_f.format_lists(data)},
                    min={
                        col: fmt(data[col].min(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    },
                    max={
                        col: fmt(data[col].max(), None)
                        for _, col, fmt in range_f.fmts
                        if col in [x_col] + y_cols + z_cols
                    })
    return ret_data, code