def build(self, parent):
    """Build the histogram payload for ``parent``'s selected column.

    :param parent: analysis context exposing ``data``, ``selected_col``,
                   ``classifier``, ``data_id`` (constructed elsewhere in this module)
    :return: tuple of (result dict for the front-end, generated code export)
    """
    # Date columns are converted to epoch timestamps so they can be binned numerically.
    if parent.classifier == "D":
        parent.data.loc[:, parent.selected_col] = apply(
            parent.data[parent.selected_col], json_timestamp)
    kde_code = []
    if self.target is None:
        # Single-series histogram, optionally overlaid with a KDE curve.
        return_data, hist_labels = self.build_histogram_data(
            parent.data[parent.selected_col])
        kde, kde_code = build_kde(parent.data[parent.selected_col],
                                  hist_labels, parent.selected_col)
        if kde is not None:
            return_data["kde"] = kde
    else:
        # One histogram per distinct target value.
        # NOTE(review): each group is binned independently by
        # build_histogram_data while "labels" is simply range(self.bins), so
        # groups may not share bin edges — confirm this is intended (a later
        # variant of this method bins once with pd.cut before grouping).
        return_data = {"targets": [], "labels": list(range(self.bins))}
        target_dtype = find_dtype(parent.data[self.target])
        target_formatter = find_dtype_formatter(target_dtype)
        for target, target_data in parent.data[[
                self.target, parent.selected_col
        ]].groupby(self.target):
            target_data, _ = self.build_histogram_data(
                target_data[parent.selected_col])
            target_data["target"] = target_formatter(target, as_string=True)
            return_data["targets"].append(target_data)
    # Describe stats, enriched with any precomputed skew/kurtosis from the
    # global dtype cache.
    desc, desc_code = load_describe(parent.data[parent.selected_col])
    dtype_info = global_state.get_dtype_info(parent.data_id, parent.selected_col)
    for p in ["skew", "kurt"]:
        if p in dtype_info:
            desc[p] = dtype_info[p]
    return_data["desc"] = desc
    return return_data, self._build_code(parent, kde_code, desc_code)
def __init__(self, data_id, column, cfg):
    """Parse the filter configuration and pick the matching filter builder.

    :param data_id: identifier of the dataframe in global state
    :param column: name of the column being filtered
    :param cfg: JSON-encoded filter configuration (or None)
    """
    self.data_id = data_id
    self.column = column
    series = global_state.get_data(data_id)[column]
    self.classification = classify_type(find_dtype(series))
    # cfg arrives JSON-encoded from the front-end; decode it when present.
    self.cfg = cfg if cfg is None else json.loads(cfg)
    filter_type = self.cfg['type']
    if filter_type == 'string':
        self.builder = StringFilter(column, self.classification, self.cfg)
    elif filter_type in ('int', 'float'):
        self.builder = NumericFilter(column, self.classification, self.cfg)
    elif filter_type == 'date':
        self.builder = DateFilter(column, self.classification, self.cfg)
def __init__(self, data_id, column, cfg):
    """Parse the filter configuration and pick the matching filter builder.

    :param data_id: identifier of the dataframe in global state
    :param column: name of the column being filtered
    :param cfg: JSON-encoded filter configuration (or None)
    """
    self.data_id = data_id
    self.column = column
    series = global_state.get_data(data_id)[column]
    self.classification = classify_type(find_dtype(series))
    self.cfg = cfg
    if self.cfg is not None:
        self.cfg = json.loads(self.cfg)
    # Map the configured filter type to its builder class; unknown types
    # leave self.builder unset, matching the original if-chain behavior.
    builders = {
        "string": StringFilter,
        "int": NumericFilter,
        "float": NumericFilter,
        "date": DateFilter,
        "outliers": OutlierFilter,
    }
    builder_cls = builders.get(self.cfg["type"])
    if builder_cls is not None:
        self.builder = builder_cls(column, self.classification, self.cfg)
def build(self, parent):
    """Build the histogram payload for ``parent``'s selected column.

    When a target column is set, the selected column is binned once with
    ``pd.cut`` so every target group shares the same bin edges, and per-group
    bin counts are returned.

    :param parent: analysis context exposing ``data``, ``selected_col``,
                   ``classifier``, ``data_id`` (constructed elsewhere in this module)
    :return: tuple of (result dict for the front-end, generated code export)
    """
    # Date columns are converted to epoch timestamps so they can be binned numerically.
    if parent.classifier == "D":
        parent.data.loc[:, parent.selected_col] = apply(
            parent.data[parent.selected_col], json_timestamp
        )
    kde_code = []
    if self.target is None:
        # Single-series histogram, optionally overlaid with a KDE curve.
        return_data, hist_labels = self.build_histogram_data(
            parent.data[parent.selected_col]
        )
        kde, kde_code = build_kde(
            parent.data[parent.selected_col], hist_labels, parent.selected_col
        )
        if kde is not None:
            return_data["kde"] = kde
    else:
        # Bin the whole column once so all target groups share bin edges.
        bin_vals = pd.cut(parent.data[parent.selected_col], bins=self.bins)
        labels = ["{}".format(c) for c in bin_vals.dtype.categories]
        parent.data.loc[:, "bin"] = bin_vals.astype("str")
        return_data = {"targets": [], "labels": labels}
        target_dtype = find_dtype(parent.data[self.target])
        target_formatter = find_dtype_formatter(target_dtype)
        for target, target_data in parent.data[[self.target, "bin"]].groupby(
            self.target
        ):
            # reindex against the full label set so every group reports a
            # count (zero-filled) for every bin.
            target_counts = target_data["bin"].value_counts()
            target_counts = [
                int(tc) for tc in target_counts.reindex(labels, fill_value=0).values
            ]
            return_data["targets"].append(
                dict(
                    target=target_formatter(target, as_string=True),
                    data=target_counts,
                )
            )
    # Describe stats, enriched with any precomputed skew/kurtosis from the
    # global dtype cache.
    desc, desc_code = load_describe(parent.data[parent.selected_col])
    dtype_info = global_state.get_dtype_info(parent.data_id, parent.selected_col)
    for p in ["skew", "kurt"]:
        if p in dtype_info:
            desc[p] = dtype_info[p]
    return_data["desc"] = desc
    return return_data, self._build_code(parent, kde_code, desc_code)
def __init__(self, data_id, req):
    """Load filtered data for the requested column and select the analysis
    implementation matching the requested (or inferred) analysis type.

    :param data_id: identifier of the dataframe in global state
    :param req: incoming request carrying "type" and "col" arguments
    """
    self.data_id = data_id
    self.analysis_type = get_str_arg(req, "type")
    curr_settings = global_state.get_settings(data_id) or {}
    self.query = build_query(data_id, curr_settings.get("query"))
    df = load_filterable_data(data_id, req, query=self.query)
    self.selected_col = find_selected_column(df, get_str_arg(req, "col", "values"))
    # Drop rows where the selected column is null before analyzing.
    self.data = df[~pd.isnull(df[self.selected_col])]
    self.dtype = find_dtype(self.data[self.selected_col])
    self.classifier = classify_type(self.dtype)
    code_imports = "\n".join([
        "import numpy as np",
        "import pandas as pd",
        "import plotly.graph_objs as go",
    ])
    self.code = build_code_export(data_id, imports="{}\n".format(code_imports))
    # Numeric/date columns default to a histogram; everything else to value counts.
    if self.analysis_type is None:
        self.analysis_type = (
            "histogram" if self.classifier in ["F", "I", "D"] else "value_counts"
        )
    analyses = {
        "geolocation": GeolocationAnalysis,
        "histogram": HistogramAnalysis,
        "categories": CategoryAnalysis,
        "value_counts": ValueCountAnalysis,
        "word_value_counts": WordValueCountAnalysis,
    }
    if self.analysis_type in analyses:
        self.analysis = analyses[self.analysis_type](req)
    elif self.analysis_type == "qq":
        # QQAnalysis takes no request arguments.
        self.analysis = QQAnalysis()
def _build_val(col, val):
    """Normalize *val* for use against *col*: date columns get their value
    converted to a JSON-serializable date, everything else passes through."""
    is_date_col = classify_type(find_dtype(data[col])) == "D"
    if not is_date_col:
        return val
    return json_date(convert_date_val_to_date(val))
def get_inner_replacement_value_as_str(val, series):
    """Render *val* as a Python source snippet for generated replacement code.

    :param val: replacement value (the string "nan" maps to ``np.nan``)
    :param series: series whose dtype decides whether *val* must be quoted
    :return: ``"np.nan"``, a quoted string literal, or *val* unchanged
    """
    if isinstance(val, string_types) and val.lower() == "nan":
        return "np.nan"
    if classify_type(find_dtype(series)) == "S":
        # Escape backslashes and single quotes so values containing them
        # still produce a valid Python string literal in the code export
        # (previously they were interpolated raw and broke the snippet).
        escaped = str(val).replace("\\", "\\\\").replace("'", "\\'")
        return "'{value}'".format(value=escaped)
    return val
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_val=None,
                     agg=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.
    Will return a dictionary of dictionaries (one for each series) which contain
    the data for the x & y axes of the chart as well as the minimum & maximum of
    all the series for the y-axis. If there is only one series (no group_col
    specified) the only key in the dictionary of series data will be 'all',
    otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are:
                count, first, last, mean, median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param return_raw: if True, return the prepared dataframe instead of the payload
    :type return_raw: bool, optional
    :param animate_by: column to animate frames by
    :type animate_by: str, optional
    :return: dict of chart data and the code-export lines, or a dataframe when return_raw
    """
    # integers get a plain "{}" format so group keys round-trip as strings
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    x_col = str("x")
    if x is None:
        # no x-axis supplied: synthesize one
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    # with a z-axis (3D/heatmap) the y columns participate in sorting/grouping
    sort_cols = y_cols if len(z_cols) else []
    if group_col is not None and len(group_col):
        # ---- grouped path: one series per distinct group value ----
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None:
            data, agg_code = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
            )
            code += agg_code
        # refuse to build unreadable charts with too many series
        MAX_GROUPS = 30
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # NOTE(review): "from then" below looks like a typo for "from the";
            # left untouched here since it is a runtime user-facing string.
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + y_cols + z_cols
            },
        )
        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yields (filter-expression key, formatted series data) per group
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                group_filter = " and ".join(list(_group_filter()))
                yield group_filter, data_f.format_lists(grp)

        if animate_by is not None:
            # one frame per animate_by value; "data" mirrors the last frame
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # ---- ungrouped path: a single "all" series ----
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    y_cols = [str(y_col) for y_col in y_cols]
    data = data[main_group + y_cols + z_cols]
    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")
    if agg is not None:
        data, agg_code = build_agg_data(data,
                                        x_col,
                                        y_cols,
                                        kwargs,
                                        agg,
                                        z=z_col,
                                        animate_by=animate_by)
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate/point-count guard: 3D & animated charts allow more points
    dupe_cols = main_group + (y_cols if len(z_cols) else [])
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=40000 if len(z_cols) or animate_by is not None else 15000,
    )
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_cols + z_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_cols + z_cols
        },
    )
    if animate_by is not None:
        # one frame per animate_by value; "data" mirrors the last frame
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code
def build_base_chart(raw_data,
                     x,
                     y,
                     group_col=None,
                     group_type=None,
                     group_val=None,
                     bins_val=None,
                     bin_type=None,
                     agg=None,
                     extended_aggregation=None,
                     allow_duplicates=False,
                     return_raw=False,
                     unlimited_data=False,
                     animate_by=None,
                     cleaners=None,
                     **kwargs):
    """
    Helper function to return data for 'chart-data' & 'correlations-ts' endpoints.
    Will return a dictionary of dictionaries (one for each series) which contain
    the data for the x & y axes of the chart as well as the minimum & maximum of
    all the series for the y-axis. If there is only one series (no group_col
    specified) the only key in the dictionary of series data will be 'all',
    otherwise the keys will be the values of the groups.

    :param raw_data: dataframe to be used for chart
    :type raw_data: :class:`pandas:pandas.DataFrame`
    :param x: column to be used as x-axis of chart
    :type x: str
    :param y: column to be used as y-axis of chart
    :type y: list of strings
    :param group_col: columns to group chart data by
    :type group_col: list of strings, optional
    :param group_type: "groups" or "bins" (numeric group columns get binned)
    :type group_type: str, optional
    :param bins_val: number of bins when group_type is "bins"
    :type bins_val: int, optional
    :param bin_type: "width" (equal-width, pd.qcut) or equal-frequency binning
    :type bin_type: str, optional
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are:
                count, first, last, mean, median, min, max, std, var, mad, prod, sum
    :type agg: str, optional
    :param extended_aggregation: list of configurations that point to a specific function
                                 that can be applied to :func: pandas.core.groupby.DataFrameGroupBy.
    :type extended_aggregation: list, optional
    :param allow_duplicates: flag to allow duplicates to be ignored (usually for scatter plots)
    :type allow_duplicates: bool, optional
    :param cleaners: string-cleaner names applied to string columns before charting
    :type cleaners: list, optional
    :return: dict of chart data and the code-export lines, or a dataframe when return_raw
    """
    # Defaults were previously mutable ([]) — use None sentinels and
    # normalize here so each call gets a fresh list (backward compatible).
    extended_aggregation = extended_aggregation or []
    cleaners = cleaners or []
    # integers get a plain "{}" format so group keys round-trip as strings
    group_fmt_overrides = {
        "I": lambda v, as_string: json_int(v, as_string=as_string, fmt="{}")
    }
    data, code = retrieve_chart_data(raw_data,
                                     x,
                                     y,
                                     kwargs.get("z"),
                                     group_col,
                                     animate_by,
                                     group_val=group_val)
    # apply any requested string cleaners to string-typed columns
    if len(cleaners):
        for col in data.columns:
            if classify_type(find_dtype(data[col])) == "S":
                code.append("s = chart_data['{}']".format(col))
                cleaned_col, cleaned_code = handle_cleaners(
                    data[col], ",".join(cleaners))
                data.loc[:, col] = cleaned_col
                code += cleaned_code
                code.append("chart_data.loc[:, '{}'] = s".format(col))
    x_col = str("x")
    if x is None:
        # no x-axis supplied: synthesize one
        x = x_col
        data.loc[:, x_col] = range(1, len(data) + 1)  # sequential integers: 1, 2, ..., N
    y_cols = make_list(y)
    z_col = kwargs.get("z")
    z_cols = make_list(z_col)
    y_cols = [str(col) for col in y_cols]
    is_z = len(z_cols) > 0
    # with a z-axis (3D/heatmap) the y columns participate in sorting/grouping
    y_group_cols = y_cols if is_z else []
    sort_cols = y_group_cols
    final_cols = y_cols + z_cols
    if group_col is not None and len(group_col):
        # ---- grouped path: one series per distinct group value ----
        # bin float columns (and int columns when requested) into categories
        for col in make_list(group_col):
            classifier = classify_type(find_dtype(data[col]))
            if classifier == "F" or (classifier == "I" and group_type == "bins"):
                if bin_type == "width":
                    data.loc[:, col] = pd.qcut(data[col],
                                               q=bins_val,
                                               duplicates="drop").astype("str")
                    code.append((
                        "chart_data.loc[:, '{col}'] = pd.qcut(chart_data['{col}'], q={bins}, duplicates=\"drop\")"
                    ).format(col=col, bins=bins_val))
                else:
                    # equal-frequency bins via interpolation over the sorted values
                    bins_data = data[col].dropna()
                    npt = len(bins_data)
                    equal_freq_bins = np.interp(
                        np.linspace(0, npt, bins_val + 1),
                        np.arange(npt),
                        np.sort(bins_data),
                    )
                    data.loc[:, col] = pd.cut(data[col],
                                              bins=equal_freq_bins,
                                              duplicates="drop").astype("str")
                    code.append((
                        "bins_data = data['{col}'].dropna()\n"
                        "npt = len(bins_data)\n"
                        "equal_freq_bins = np.interp(np.linspace(0, npt, {bins}), np.arange(npt), "
                        "np.sort(bins_data))\n"
                        "chart_data.loc[:, '{col}'] = pd.cut(chart_data['{col}'], bins=equal_freq_bins, "
                        'duplicates="drop")').format(col=col, bins=bins_val + 1))
        main_group = group_col
        if animate_by is not None:
            main_group = [animate_by] + main_group
        sort_cols = main_group + [x] + sort_cols
        data = data.sort_values(sort_cols)
        code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
            cols="', '".join(sort_cols)))
        check_all_nan(data)
        data = data.rename(columns={x: x_col})
        code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                    x_col + "'})")
        if agg is not None or len(extended_aggregation):
            data, agg_code, final_cols = build_agg_data(
                data,
                x_col,
                y_cols,
                kwargs,
                agg,
                z=z_col,
                group_col=group_col,
                animate_by=animate_by,
                extended_aggregation=extended_aggregation,
            )
            code += agg_code
        # refuse to build unreadable charts with too many series
        group_vals = data[group_col].drop_duplicates()
        if len(group_vals) > MAX_GROUPS:
            dtypes = get_dtypes(group_vals)
            group_fmts = {
                c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
                for c in group_col
            }
            group_f, _ = build_formatters(group_vals)
            group_vals = group_f.format_lists(group_vals)
            group_vals = pd.DataFrame(group_vals, columns=group_col)
            # NOTE(review): "from then" below looks like a typo for "from the";
            # left untouched here since it is a runtime user-facing string.
            msg = (
                "Group ({}) contains more than {} unique values, more groups than that will make the chart unreadable. "
                'You can choose specific groups to display from then "Group(s)" dropdown above. The available group(s) '
                "are listed below:\n\n{}").format(
                    ", ".join(group_col), MAX_GROUPS,
                    group_vals.to_string(index=False))
            raise ChartBuildingError(msg, group_vals.to_string(index=False))
        data = data.dropna()
        if return_raw:
            return data.rename(columns={x_col: x})
        code.append("chart_data = chart_data.dropna()")
        data_f, range_f = build_formatters(data)
        ret_data = dict(
            data={},
            min={
                col: fmt(data[col].min(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + final_cols
            },
            max={
                col: fmt(data[col].max(), None)
                for _, col, fmt in range_f.fmts
                if col in [x_col] + final_cols
            },
        )
        dtypes = get_dtypes(data)
        group_fmts = {
            c: find_dtype_formatter(dtypes[c], overrides=group_fmt_overrides)
            for c in group_col
        }

        def _load_groups(df):
            # yields (group label, formatted series data with "_filter_") per group
            for group_val, grp in df.groupby(group_col):

                def _group_filter():
                    for gv, gc in zip(make_list(group_val), group_col):
                        classifier = classify_type(dtypes[gc])
                        yield group_filter_handler(
                            gc, group_fmts[gc](gv, as_string=True), classifier)

                final_group_filter, final_group_label = [], []
                for gf, gl in _group_filter():
                    final_group_filter.append(gf)
                    final_group_label.append(gl)
                group_filter = " and ".join(final_group_filter)
                group_label = "({})".format(", ".join(final_group_label))
                data = data_f.format_lists(grp)
                data["_filter_"] = group_filter
                yield group_label, data

        if animate_by is not None:
            # one frame per animate_by value; "data" mirrors the last frame
            frame_fmt = find_dtype_formatter(dtypes[animate_by],
                                             overrides=group_fmt_overrides)
            ret_data["frames"] = []
            for frame_key, frame in data.sort_values(animate_by).groupby(
                    animate_by):
                ret_data["frames"].append(
                    dict(
                        data=dict(_load_groups(frame)),
                        name=frame_fmt(frame_key, as_string=True),
                    ))
            ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
        else:
            ret_data["data"] = dict(_load_groups(data))
        return ret_data, code
    # ---- ungrouped path: a single "all" series ----
    main_group = [x]
    if animate_by is not None:
        main_group = [animate_by] + main_group
    sort_cols = main_group + sort_cols
    data = data.sort_values(sort_cols)
    code.append("chart_data = chart_data.sort_values(['{cols}'])".format(
        cols="', '".join(sort_cols)))
    check_all_nan(data)
    data = data[main_group + final_cols]
    data = data.rename(columns={x: x_col})
    main_group = [x_col if c == x else c for c in main_group]
    code.append("chart_data = chart_data.rename(columns={'" + x + "': '" +
                x_col + "'})")
    # convert booleans into integers for aggregation
    for col in z_cols or y_cols:
        classifier = classify_type(find_dtype(data[col]))
        if classifier == "B":
            data.loc[:, col] = data[col].astype("int")
    if agg is not None or len(extended_aggregation):
        data, agg_code, final_cols = build_agg_data(
            data,
            x_col,
            y_cols,
            kwargs,
            agg,
            z=z_col,
            animate_by=animate_by,
            extended_aggregation=extended_aggregation,
        )
        code += agg_code
    data = data.dropna()
    if return_raw:
        return data.rename(columns={x_col: x})
    code.append("chart_data = chart_data.dropna()")
    # duplicate/point-count guard: 3D & animated charts allow more points
    dupe_cols = main_group + y_group_cols
    data_limit = global_state.get_chart_settings(
    )["3d_points" if is_z or animate_by is not None else "scatter_points"]
    check_exceptions(
        data[dupe_cols].rename(columns={x_col: x}),
        allow_duplicates or agg in ["raw", "drop_duplicates"],
        unlimited_data=unlimited_data,
        data_limit=data_limit,
    )
    data_f, range_f = build_formatters(data)
    ret_data = dict(
        min={
            col: fmt(data[col].min(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
        max={
            col: fmt(data[col].max(), None)
            for _, col, fmt in range_f.fmts
            if col in [x_col] + y_group_cols + final_cols
        },
    )
    if animate_by is not None:
        # one frame per animate_by value; "data" mirrors the last frame
        frame_fmt = find_dtype_formatter(find_dtype(data[animate_by]),
                                         overrides=group_fmt_overrides)
        ret_data["frames"] = []
        for frame_key, frame in data.sort_values(animate_by).groupby(
                animate_by):
            ret_data["frames"].append(
                dict(
                    data={str("all"): data_f.format_lists(frame)},
                    name=frame_fmt(frame_key, as_string=True),
                ))
        ret_data["data"] = copy.deepcopy(ret_data["frames"][-1]["data"])
    else:
        ret_data["data"] = {str("all"): data_f.format_lists(data)}
    return ret_data, code