def get_correlations_ts():
    """
    Flask route which returns timeseries of Pearson correlations of two columns with numeric data
    using :meth:`pandas:pandas.DataFrame.corr`

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :returns: JSON {
        data: {:col1:col2: {data: [{corr: 0.99, date: 'YYYY-MM-DD'},...], max: 0.99, min: 0.99}
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[get_port()]
        data = data.query(query) if query is not None else data
        cols = get_str_arg(request, 'cols')
        cols = cols.split(',')
        date_col = get_str_arg(request, 'dateCol')
        data = data.groupby(date_col)[list(set(cols))].corr(method='pearson')
        data.index.names = ['date', 'column']
        data = data.reset_index()
        data = data[data.column == cols[0]][['date', cols[1]]]
        data.columns = ['date', 'corr']
        data = {k: v for k, v in _build_timeseries_chart_data('corr', data, ['corr'])}
        return jsonify(dict(data=data))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def test_getters(builtin_pkg):
    req = build_req_tuple(
        {'int': '1', 'empty_int': '', 'str': 'hello', 'empty_str': '', 'bool': 'true', 'float': '1.1'}
    )
    val = utils.get_str_arg(req, 'str')
    assert isinstance(val, str) and val == 'hello'
    val = utils.get_str_arg(req, 'str_def', default='def')
    assert val == 'def'
    val = utils.get_str_arg(req, 'empty_str')
    assert val is None
    with mock.patch('{}.str'.format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_str_arg(req, 'str', default='def')
        assert val == 'def'
    val = utils.get_int_arg(req, 'int')
    assert isinstance(val, int) and val == 1
    val = utils.get_int_arg(req, 'int_def', default=2)
    assert val == 2
    val = utils.get_int_arg(req, 'empty_int')
    assert val is None
    with mock.patch('{}.int'.format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_int_arg(req, 'int', default=2)
        assert val == 2
    val = utils.get_bool_arg(req, 'bool')
    assert isinstance(val, bool) and val
    val = utils.get_float_arg(req, 'float')
    assert isinstance(val, float) and val == 1.1
    val = utils.get_float_arg(req, 'int_def', default=2.0)
    assert val == 2.0
    val = utils.get_float_arg(req, 'empty_float')
    assert val is None
    with mock.patch('{}.float'.format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_float_arg(req, 'float', default=2.0)
        assert val == 2
def get_histogram(data_id):
    """
    :class:`flask:flask.Flask` route which returns output from numpy.histogram to front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param bins: the number of bins to display in your histogram, options on the front-end are 5, 10, 20, 50
    :returns: JSON {results: DATA, desc: output from pd.DataFrame[col].describe(), success: True/False}
    """
    col = get_str_arg(request, 'col', 'values')
    query = get_str_arg(request, 'query')
    bins = get_int_arg(request, 'bins', 20)
    try:
        data = DATA[data_id]
        if query:
            data = data.query(query)
        selected_col = find_selected_column(data, col)
        data = data[~pd.isnull(data[selected_col])][[selected_col]]
        hist = np.histogram(data, bins=bins)
        desc = load_describe(data[selected_col])
        return jsonify(data=[json_float(h) for h in hist[0]],
                       labels=['{0:.1f}'.format(l) for l in hist[1]],
                       desc=desc)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
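# A minimal sketch (sample values hypothetical) of the numpy.histogram contract the route above
# relies on: the first element of the tuple holds `bins` counts, the second holds `bins + 1`
# edges, which is why the labels come from hist[1].
import numpy as np

values = np.array([1.0, 1.5, 2.0, 2.5, 9.0])
counts, edges = np.histogram(values, bins=4)
assert len(counts) == 4 and len(edges) == 5  # one more edge than bins
labels = ['{0:.1f}'.format(edge) for edge in edges]  # mirrors the route's label formatting
print(counts, labels)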
def __init__(self, req):
    self.category_col = get_str_arg(req, "categoryCol")
    self.category_agg = get_str_arg(req, "categoryAgg", "mean")
    self.aggs = [
        "count",
        "sum" if self.category_agg == "pctsum" else self.category_agg,
    ]
    self.top = get_int_arg(req, "top")
def get_scatter():
    """
    Flask route which returns data used in correlation of two columns for scatter chart

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :param date: string from flask.request.args['date'] date value in dateCol to filter dataframe to
    :returns: JSON {
        data: [{col1: 0.123, col2: 0.123, index: 1},...,{col1: 0.123, col2: 0.123, index: N}],
        stats: {
            correlated: 50,
            only_in_s0: 1,
            only_in_s1: 2,
            pearson: 0.987,
            spearman: 0.879,
        },
        x: col1,
        y: col2
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    cols = get_str_arg(request, 'cols')
    cols = cols.split(',')
    query = get_str_arg(request, 'query')
    date = get_str_arg(request, 'date')
    date_col = get_str_arg(request, 'dateCol')
    try:
        data = DATA[get_port()]
        data = data[data[date_col] == date] if date else data
        if query:
            data = data.query(query)
        data = data[list(set(cols))].dropna(how='any')
        data[str('index')] = data.index
        s0 = data[cols[0]]
        s1 = data[cols[1]]
        pearson = s0.corr(s1, method='pearson')
        spearman = s0.corr(s1, method='spearman')
        stats = dict(pearson='N/A' if pd.isnull(pearson) else pearson,
                     spearman='N/A' if pd.isnull(spearman) else spearman,
                     correlated=len(data),
                     only_in_s0=len(data[data[cols[0]].isnull()]),
                     only_in_s1=len(data[data[cols[1]].isnull()]))
        if len(data) > 15000:
            return jsonify(
                stats=stats,
                error='Dataset exceeds 15,000 records, cannot render scatter. Please apply filter...'
            )
        f = grid_formatter(grid_columns(data))
        data = f.format_dicts(data.itertuples())
        return jsonify(data=data, x=cols[0], y=cols[1], stats=stats)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def get_chart_data(data_id):
    """
    :class:`flask:flask.Flask` route which builds data associated with a chart.js chart

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param x: string from flask.request.args['x'] column to be used as x-axis of chart
    :param y: string from flask.request.args['y'] column to be used as y-axis of chart
    :param group: string from flask.request.args['group'] comma-separated string of columns to group chart data by
    :param agg: string from flask.request.args['agg'] points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last, mean,
                median, min, max, std, var, mad, prod, sum
    :returns: JSON {
        data: {
            series1: {x: [x1, x2, ..., xN], y: [y1, y2, ..., yN]},
            series2: {x: [x1, x2, ..., xN], y: [y1, y2, ..., yN]},
            ...,
            seriesN: {x: [x1, x2, ..., xN], y: [y1, y2, ..., yN]},
        },
        min: minY,
        max: maxY,
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        if query:
            try:
                data = data.query(query)
            except BaseException as e:
                return jsonify(dict(error='Invalid query: {}'.format(str(e))))
            if not len(data):
                return jsonify(dict(error='query "{}" found no data, please alter'.format(query)))
        x = get_str_arg(request, 'x')
        y = get_json_arg(request, 'y')
        group_col = get_json_arg(request, 'group')
        agg = get_str_arg(request, 'agg')
        allow_duplicates = get_bool_arg(request, 'allowDupes')
        window = get_int_arg(request, 'rollingWin')
        comp = get_str_arg(request, 'rollingComp')
        data = build_chart(data, x, y, group_col, agg, allow_duplicates,
                           rolling_win=window, rolling_comp=comp)
        data['success'] = True
        return jsonify(data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def get_port():
    """
    Helper function to grab port information (SERVER_PORT) from Flask.request.environ
    """
    return get_str_arg(request, 'port', request.environ.get('SERVER_PORT', 'curr'))
def get_correlations(data_id):
    """
    :class:`flask:flask.Flask` route which gathers Pearson correlations against all combinations of columns with
    numeric data using :meth:`pandas:pandas.DataFrame.corr`

    On large datasets with no :attr:`numpy:numpy.nan` data this code will use :meth:`numpy:numpy.corrcoef`
    for speed purposes

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :returns: JSON {
        data: [{column: col1, col1: 1.0, col2: 0.99, colN: 0.45},...,{column: colN, col1: 0.34, col2: 0.88, colN: 1.0}],
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        data = data.query(query) if query is not None else data
        valid_corr_cols = []
        valid_date_cols = []
        rolling = False
        for col_info in DTYPES[data_id]:
            name, dtype = map(col_info.get, ['name', 'dtype'])
            dtype = classify_type(dtype)
            if dtype in ['I', 'F']:
                valid_corr_cols.append(name)
            elif dtype == 'D':
                # even if a datetime column exists, we need to make sure that there is enough data for a date
                # to warrant a correlation, https://github.com/man-group/dtale/issues/43
                date_counts = data[name].dropna().value_counts()
                if len(date_counts[date_counts > 1]) > 1:
                    valid_date_cols.append(name)
                elif date_counts.eq(1).all():
                    valid_date_cols.append(name)
                    rolling = True
        if data[valid_corr_cols].isnull().values.any():
            data = data.corr(method='pearson')
        else:
            # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
            # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
            data = np.corrcoef(data[valid_corr_cols].values, rowvar=False)
            data = pd.DataFrame(data, columns=valid_corr_cols, index=valid_corr_cols)
        data.index.name = str('column')
        data = data.reset_index()
        col_types = grid_columns(data)
        f = grid_formatter(col_types, nan_display=None)
        return jsonify(data=f.format_dicts(data.itertuples()), dates=valid_date_cols, rolling=rolling)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
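# A minimal sketch of the numpy.corrcoef fast path used above: on NaN-free numeric data it
# matches DataFrame.corr(method='pearson'), with rowvar=False telling numpy that columns
# (not rows) are the variables. Sample data is hypothetical.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [2.0, 4.0, 5.0, 9.0]})
fast = pd.DataFrame(np.corrcoef(df.values, rowvar=False), columns=df.columns, index=df.columns)
slow = df.corr(method='pearson')
assert np.allclose(fast.values, slow.values)  # identical results when no NaNs are present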
def __init__(self, data_id, req):
    self.data_id = data_id
    self.analysis_type = get_str_arg(req, "type")
    curr_settings = global_state.get_settings(data_id) or {}
    self.query = build_query(data_id, curr_settings.get("query"))
    data = load_filterable_data(data_id, req, query=self.query)
    self.selected_col = find_selected_column(data, get_str_arg(req, "col", "values"))
    self.data = data[~pd.isnull(data[self.selected_col])]
    self.dtype = find_dtype(self.data[self.selected_col])
    self.classifier = classify_type(self.dtype)
    self.code = build_code_export(
        data_id,
        imports="{}\n".format(
            "\n".join(
                [
                    "import numpy as np",
                    "import pandas as pd",
                    "import plotly.graph_objs as go",
                ]
            )
        ),
    )
    if self.analysis_type is None:
        self.analysis_type = "histogram" if self.classifier in ["F", "I", "D"] else "value_counts"
    if self.analysis_type == "geolocation":
        self.analysis = GeolocationAnalysis(req)
    elif self.analysis_type == "histogram":
        self.analysis = HistogramAnalysis(req)
    elif self.analysis_type == "categories":
        self.analysis = CategoryAnalysis(req)
    elif self.analysis_type == "value_counts":
        self.analysis = ValueCountAnalysis(req)
    elif self.analysis_type == "word_value_counts":
        self.analysis = WordValueCountAnalysis(req)
    elif self.analysis_type == "qq":
        self.analysis = QQAnalysis()
def test_getters(builtin_pkg):
    req = build_req_tuple({
        "int": "1",
        "empty_int": "",
        "str": "hello",
        "empty_str": "",
        "bool": "true",
        "float": "1.1",
    })
    val = utils.get_str_arg(req, "str")
    assert isinstance(val, str) and val == "hello"
    val = utils.get_str_arg(req, "str_def", default="def")
    assert val == "def"
    val = utils.get_str_arg(req, "empty_str")
    assert val is None
    with mock.patch("{}.str".format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_str_arg(req, "str", default="def")
        assert val == "def"
    val = utils.get_int_arg(req, "int")
    assert isinstance(val, int) and val == 1
    val = utils.get_int_arg(req, "int_def", default=2)
    assert val == 2
    val = utils.get_int_arg(req, "empty_int")
    assert val is None
    with mock.patch("{}.int".format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_int_arg(req, "int", default=2)
        assert val == 2
    val = utils.get_bool_arg(req, "bool")
    assert isinstance(val, bool) and val
    val = utils.get_float_arg(req, "float")
    assert isinstance(val, float) and val == 1.1
    val = utils.get_float_arg(req, "int_def", default=2.0)
    assert val == 2.0
    val = utils.get_float_arg(req, "empty_float")
    assert val is None
    with mock.patch("{}.float".format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_float_arg(req, "float", default=2.0)
        assert val == 2
def get_correlations_ts(data_id):
    """
    :class:`flask:flask.Flask` route which returns timeseries of Pearson correlations of two columns with numeric data
    using :meth:`pandas:pandas.DataFrame.corr`

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: JSON string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :returns: JSON {
        data: {:col1:col2: {data: [{corr: 0.99, date: 'YYYY-MM-DD'},...], max: 0.99, min: 0.99}
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        data = data.query(query) if query is not None else data
        cols = get_str_arg(request, 'cols')
        cols = json.loads(cols)
        date_col = get_str_arg(request, 'dateCol')
        rolling_window = get_int_arg(request, 'rollingWindow')
        if rolling_window:
            [col1, col2] = list(set(cols))
            data = data[[date_col, col1, col2]].set_index(date_col)
            data = data[[col1, col2]].rolling(rolling_window).corr().reset_index()
            data = data.dropna()
            data = data[data['level_1'] == col1][[date_col, col2]]
        else:
            data = data.groupby(date_col)[list(set(cols))].corr(method='pearson')
            data.index.names = ['date', 'column']
            data = data.reset_index()
            data = data[data.column == cols[0]][['date', cols[1]]]
        data.columns = ['date', 'corr']
        return_data = build_chart(data.fillna(0), 'date', 'corr')
        return_data['success'] = True
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
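# A minimal sketch (sample data hypothetical) of the rolling-correlation shape handled above:
# rolling(window).corr() on a two-column frame yields a 2x2 correlation block per row in a
# MultiIndex whose second level ('level_1' after reset_index()) names the row variable, so
# filtering it to one column leaves that column's correlation series against the other.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.arange(10.0), 'b': np.arange(10.0) ** 2})
corr = df.rolling(5).corr().reset_index().dropna()
# keep only the rows describing 'a', then read off its rolling correlation with 'b'
ab = corr[corr['level_1'] == 'a'][['level_0', 'b']]
print(ab)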
def test_filter():
    """
    Flask route which will test out pandas query before it gets applied to DATA and return exception
    information to the screen if there is any

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :return: JSON {success: True/False}
    """
    try:
        query = get_str_arg(request, 'query')
        _test_filter(DATA[get_port()], query)
        return jsonify(dict(success=True))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def get_correlations():
    """
    Flask route which gathers Pearson correlations against all combinations of columns with numeric data
    using :meth:`pandas:pandas.DataFrame.corr`

    On large datasets with no :attr:`numpy:numpy.nan` data this code will use :meth:`numpy:numpy.corrcoef`
    for speed purposes

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :returns: JSON {
        data: [{column: col1, col1: 1.0, col2: 0.99, colN: 0.45},...,{column: colN, col1: 0.34, col2: 0.88, colN: 1.0}],
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        port = get_port()
        data = DATA[port]
        data = data.query(query) if query is not None else data
        valid_corr_cols = []
        valid_date_cols = []
        for col_info in DTYPES[port]:
            name, dtype = map(col_info.get, ['name', 'dtype'])
            dtype = classify_type(dtype)
            if dtype in ['I', 'F']:
                valid_corr_cols.append(name)
            elif dtype == 'D' and len(data[name].dropna().unique()) > 1:
                valid_date_cols.append(name)
        if data[valid_corr_cols].isnull().values.any():
            data = data.corr(method='pearson')
        else:
            # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
            # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
            data = np.corrcoef(data[valid_corr_cols].values, rowvar=False)
            data = pd.DataFrame(data, columns=valid_corr_cols, index=valid_corr_cols)
        data.index.name = str('column')
        data = data.reset_index()
        col_types = grid_columns(data)
        f = grid_formatter(col_types, nan_display=None)
        return jsonify(data=f.format_dicts(data.itertuples()), dates=valid_date_cols)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def test_filter(data_id):
    """
    :class:`flask:flask.Flask` route which will test out pandas query before it gets applied to DATA and
    return exception information to the screen if there is any

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :return: JSON {success: True/False}
    """
    try:
        query = get_str_arg(request, 'query')
        _test_filter(DATA[data_id], query)
        return jsonify(dict(success=True))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def filter_data(df, req, groups, query=None):
    filters = get_str_arg(req, 'filters')
    if not filters:
        return df.query(query or 'index == index'), groups, ''
    filters = json.loads(filters)
    col, prev_freq, freq, end = map(filters[-1].get, ['name', 'prevFreq', 'freq', 'date'])
    start = DATE_RANGES[prev_freq](pd.Timestamp(end)).strftime('%Y%m%d')
    range_query = "{col} >= '{start}' and {col} <= '{end}'".format(col=col, start=start, end=end)
    logger.info('filtered coverage data to slice: {}'.format(range_query))
    updated_groups = [dict(name=col, freq=freq) if g['name'] == col else g for g in groups]
    return df.query(query or 'index == index').query(range_query), updated_groups, range_query
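# A minimal sketch (sample values and the DATE_RANGES stand-in are hypothetical) of how the last
# entry in the filters JSON above becomes a date-range query() string; DATE_RANGES is assumed to
# map a frequency code to a function returning the start of the period containing a timestamp.
import pandas as pd

DATE_RANGES = {'W': lambda end: end - pd.Timedelta(days=7)}  # stand-in mapping

filters = [{'name': 'trade_date', 'prevFreq': 'W', 'freq': 'D', 'date': '2024-03-08'}]
col, prev_freq, freq, end = map(filters[-1].get, ['name', 'prevFreq', 'freq', 'date'])
start = DATE_RANGES[prev_freq](pd.Timestamp(end)).strftime('%Y%m%d')
range_query = "{col} >= '{start}' and {col} <= '{end}'".format(col=col, start=start, end=end)
print(range_query)  # trade_date >= '20240301' and trade_date <= '2024-03-08'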
def update_settings():
    """
    Flask route which updates global SETTINGS for current port

    :param port: number string from flask.request.environ['SERVER_PORT']
    :param settings: JSON string from flask.request.args['settings'] which gets decoded and stored in SETTINGS variable
    :return: JSON
    """
    try:
        global SETTINGS
        port = get_port()
        curr_settings = SETTINGS.get(port, {})
        updated_settings = dict_merge(curr_settings, json.loads(get_str_arg(request, 'settings', '{}')))
        SETTINGS[port] = updated_settings
        return jsonify(dict(success=True))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
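# A minimal sketch of the merge semantics assumed for dict_merge above: incoming settings
# override existing keys while untouched keys survive. The helper here is a stand-in for
# illustration, not the project's implementation.
import json

def dict_merge(*dicts):
    # stand-in: later dictionaries win on key collisions
    merged = {}
    for d in dicts:
        merged.update(d or {})
    return merged

curr_settings = {'sort': [['a', 'ASC']], 'query': 'a > 1'}
updated = dict_merge(curr_settings, json.loads('{"query": "a > 5"}'))
print(updated)  # {'sort': [['a', 'ASC']], 'query': 'a > 5'}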
def get_correlations():
    """
    Flask route which gathers Pearson correlations against all combinations of columns with numeric data
    using pandas.DataFrame.corr

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :returns: JSON {
        data: [{column: col1, col1: 1.0, col2: 0.99, colN: 0.45},...,{column: colN, col1: 0.34, col2: 0.88, colN: 1.0}],
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA.query(query) if query is not None else DATA
        data = data.corr(method='pearson')
        data.index.name = 'column'
        data = data.reset_index()
        col_types = grid_columns(data)
        f = grid_formatter(col_types, nan_display=None)
        return jsonify(data=f.format_dicts(data.itertuples()))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def get_data():
    """
    Flask route which returns current rows from DATA (based on scrollbar specs and saved settings) to
    front-end as JSON

    :param ids: required dash separated string "START-END" stating a range of row indexes to be returned
                to the screen
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param sort: JSON string from flask.request.args['sort'] which is applied to DATA using the sort_values() or
                 sort_index() function.  Here is the JSON structure: [col1,dir1],[col2,dir2],....[coln,dirn]
    :param port: number string from flask.request.environ['SERVER_PORT'] for retrieving saved settings
    :return: JSON {
        results: [
            {dtale_index: 1, col1: val1_1, ...,colN: valN_1},
            ...,
            {dtale_index: N2, col1: val1_N2, ...,colN: valN_N2}
        ],
        columns: [{name: col1, dtype: 'int64'},...,{name: colN, dtype: 'datetime'}],
        total: N2,
        success: True/False
    }
    """
    try:
        global SETTINGS, DATA
        params = retrieve_grid_params(request)
        ids = get_str_arg(request, 'ids')
        if ids:
            ids = json.loads(ids)
        else:
            return jsonify({})
        col_types = grid_columns(DATA)
        f = grid_formatter(col_types)
        curr_settings = SETTINGS.get(request.environ.get('SERVER_PORT', 'curr'), {})
        if curr_settings.get('sort') != params.get('sort'):
            DATA = sort_df_for_grid(DATA, params)
        df = DATA
        if params.get('sort') is not None:
            curr_settings = dict_merge(curr_settings, dict(sort=params['sort']))
        else:
            curr_settings = {k: v for k, v in curr_settings.items() if k != 'sort'}
        df = filter_df_for_grid(df, params)
        if params.get('query') is not None:
            curr_settings = dict_merge(curr_settings, dict(query=params['query']))
        else:
            curr_settings = {k: v for k, v in curr_settings.items() if k != 'query'}
        SETTINGS[request.environ.get('SERVER_PORT', 'curr')] = curr_settings
        total = len(df)
        results = {}
        for sub_range in ids:
            sub_range = list(map(int, sub_range.split('-')))
            if len(sub_range) == 1:
                sub_df = df.iloc[sub_range[0]:sub_range[0] + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                results[sub_range[0]] = dict_merge(dict(dtale_index=sub_range[0]), sub_df[0])
            else:
                [start, end] = sub_range
                sub_df = df.iloc[start:] if end >= len(df) - 1 else df.iloc[start:end + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                for i, d in zip(range(start, end + 1), sub_df):
                    results[i] = dict_merge(dict(dtale_index=i), d)
        return_data = dict(results=results,
                           columns=[dict(name='dtale_index', dtype='int64')] + col_types,
                           total=total)
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def get_scatter(data_id):
    """
    :class:`flask:flask.Flask` route which returns data used in correlation of two columns for scatter chart

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: JSON string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :param date: string from flask.request.args['date'] date value in dateCol to filter dataframe to
    :returns: JSON {
        data: [{col1: 0.123, col2: 0.123, index: 1},...,{col1: 0.123, col2: 0.123, index: N}],
        stats: {
            correlated: 50,
            only_in_s0: 1,
            only_in_s1: 2,
            pearson: 0.987,
            spearman: 0.879,
        },
        x: col1,
        y: col2
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    cols = get_json_arg(request, 'cols')
    query = get_str_arg(request, 'query')
    date = get_str_arg(request, 'date')
    date_col = get_str_arg(request, 'dateCol')
    rolling = get_bool_arg(request, 'rolling')
    try:
        data = DATA[data_id]
        if query:
            data = data.query(query)
        idx_col = str('index')
        y_cols = [cols[1], idx_col]
        if rolling:
            window = get_int_arg(request, 'window')
            idx = min(data[data[date_col] == date].index) + 1
            data = data.iloc[max(idx - window, 0):idx]
            data = data[list(set(cols)) + [date_col]].dropna(how='any')
            y_cols.append(date_col)
        else:
            data = data[data[date_col] == date] if date else data
            data = data[list(set(cols))].dropna(how='any')
        data[idx_col] = data.index
        s0 = data[cols[0]]
        s1 = data[cols[1]]
        pearson = s0.corr(s1, method='pearson')
        spearman = s0.corr(s1, method='spearman')
        stats = dict(
            pearson='N/A' if pd.isnull(pearson) else pearson,
            spearman='N/A' if pd.isnull(spearman) else spearman,
            correlated=len(data),
            only_in_s0=len(data[data[cols[0]].isnull()]),
            only_in_s1=len(data[data[cols[1]].isnull()])
        )
        if len(data) > 15000:
            return jsonify(
                stats=stats,
                error='Dataset exceeds 15,000 records, cannot render scatter. Please apply filter...'
            )
        data = build_chart(data, cols[0], y_cols, allow_duplicates=True)
        data['x'] = cols[0]
        data['y'] = cols[1]
        data['stats'] = stats
        return jsonify(data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
def find_coverage():
    """
    Flask route which returns coverage information(counts) for a column grouped by other column(s)

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
    :param filters(deprecated): JSON string from flask.request.args['filters'] with filtering information from group
           drilldown [
               {name: col1, prevFreq: Y, freq: Q, date: YYYY-MM-DD},
               ...
               {name: col1, prevFreq: D, freq: W, date: YYYY-MM-DD},
           ]
    :param group: JSON string from flask.request.args['group'] containing grouping logic in this structure [
               {name: col1} or {name: date_col1, freq: [D,W,M,Q,Y]}
           ]
    :returns: JSON {
        data: {[col]: [count1,count2,...,countN]},
        labels: [{group_col1: gc1_v1, group_col2: gc2_v1},...,{group_col1: gc1_vN, group_col2: gc2_vN}],
        success: True
    } or {error: 'Exception message', traceback: 'Exception stacktrace', success: False}
    """
    def filter_data(df, req, groups, query=None):
        filters = get_str_arg(req, 'filters')
        if not filters:
            return df.query(query or 'index == index'), groups, ''
        filters = json.loads(filters)
        col, prev_freq, freq, end = map(filters[-1].get, ['name', 'prevFreq', 'freq', 'date'])
        start = DATE_RANGES[prev_freq](pd.Timestamp(end)).strftime('%Y%m%d')
        range_query = "{col} >= '{start}' and {col} <= '{end}'".format(col=col, start=start, end=end)
        logger.info('filtered coverage data to slice: {}'.format(range_query))
        updated_groups = [dict(name=col, freq=freq) if g['name'] == col else g for g in groups]
        return df.query(query or 'index == index').query(range_query), updated_groups, range_query

    try:
        col = get_str_arg(request, 'col')
        groups = get_str_arg(request, 'group')
        if groups:
            groups = json.loads(groups)
        data = DATA[get_port()]
        data, groups, query = filter_data(data, request, groups, query=get_str_arg(request, 'query'))
        grouper = []
        for g_cfg in groups:
            if 'freq' in g_cfg:
                freq_grp = data.set_index([g_cfg['name']]).index.to_period(g_cfg['freq']).to_timestamp(how='end')
                freq_grp.name = g_cfg['name']
                grouper.append(freq_grp)
            else:
                grouper.append(data[g_cfg['name']])
        data_groups = data.groupby(grouper)
        group_data = data_groups[col].count()
        if len(groups) > 1:
            unstack_order = enumerate(zip(group_data.index.names, group_data.index.levels))
            unstack_order = sorted([(uo[0], uo[1][0], len(uo[1][1])) for uo in unstack_order],
                                   key=lambda k: k[2])
            for i, n, l in unstack_order[:-1]:
                group_data = group_data.unstack(i)
            group_data = group_data.fillna(0)
            if len(unstack_order[:-1]) > 1:
                group_data.columns = [
                    ', '.join([str(group_data.columns.levels[c2[0]][c2[1]]) for c2 in enumerate(c)])
                    for c in zip(*group_data.columns.labels)
                ]
            else:
                group_data.columns = map(str, group_data.columns.values)
        if len(group_data) > 15000:
            return jsonify(dict(error=(
                'Your grouping created {} groups, chart will not render. '
                'Try making date columns a higher frequency (W, M, Q, Y)'
            ).format(len(data_groups))))
        if len(groups) == 1:
            data = {col: [json_int(v) for v in group_data.values]}
        else:
            data = dict([(c, [json_int(v) for v in group_data[c].values]) for c in group_data.columns])
        labels = pd.DataFrame(group_data.index.values, columns=group_data.index.names)
        labels_f_overrides = {
            'D': lambda f, i, c: f.add_date(i, c, fmt='%Y-%m-%d'),
        }
        labels_f = grid_formatter(grid_columns(labels), overrides=labels_f_overrides)
        labels = labels_f.format_dicts(labels.itertuples())
        return jsonify(data=data, labels=labels, success=True)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
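# A minimal sketch (sample data hypothetical) of the unstack ordering above: when counting
# across multiple group levels, pivoting the smaller-cardinality level(s) into columns keeps
# the output narrow, which is what sorting levels by cardinality before unstack achieves.
import pandas as pd

df = pd.DataFrame({
    'region': ['east', 'east', 'west', 'west', 'west'],
    'year': [2023, 2024, 2023, 2024, 2024],
    'trades': [1, 2, 3, 4, 5],
})
counts = df.groupby(['region', 'year'])['trades'].count()
# pivot the lower-cardinality level ('region') into columns and zero-fill the gaps
wide = counts.unstack('region').fillna(0)
print(wide)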
def __init__(self, req):
    self.bins = get_int_arg(req, "bins", 20)
    self.target = get_str_arg(req, "target")
    self.density = get_bool_arg(req, "density")
def __init__(self, req):
    self.bins = get_int_arg(req, "bins", 20)
    self.target = get_str_arg(req, "target")
def get_data():
    """
    Flask route which returns current rows from DATA (based on scrollbar specs and saved settings) to
    front-end as JSON

    :param ids: required dash separated string "START-END" stating a range of row indexes to be returned
                to the screen
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param sort: JSON string from flask.request.args['sort'] which is applied to DATA using the sort_values() or
                 sort_index() function.  Here is the JSON structure: [col1,dir1],[col2,dir2],....[coln,dirn]
    :param port: number string from flask.request.environ['SERVER_PORT'] for retrieving saved settings
    :return: JSON {
        results: [
            {dtale_index: 1, col1: val1_1, ...,colN: valN_1},
            ...,
            {dtale_index: N2, col1: val1_N2, ...,colN: valN_N2}
        ],
        columns: [{name: col1, dtype: 'int64'},...,{name: colN, dtype: 'datetime'}],
        total: N2,
        success: True/False
    }
    """
    try:
        global SETTINGS, DATA, DTYPES
        port = get_port()
        data = DATA[port]

        # this will check for when someone instantiates D-Tale programmatically and directly alters the internal
        # state of the dataframe (EX: d.data['new_col'] = 'foo')
        curr_dtypes = [c['name'] for c in DTYPES[port]]
        if any(c not in curr_dtypes for c in data.columns):
            data, _ = format_data(data)
            DATA[port] = data
            DTYPES[port] = build_dtypes_state(data)

        params = retrieve_grid_params(request)
        ids = get_str_arg(request, 'ids')
        if ids:
            ids = json.loads(ids)
        else:
            return jsonify({})
        col_types = DTYPES[port]
        f = grid_formatter(col_types)
        curr_settings = SETTINGS.get(port, {})
        if curr_settings.get('sort') != params.get('sort'):
            data = sort_df_for_grid(data, params)
            DATA[port] = data
        if params.get('sort') is not None:
            curr_settings = dict_merge(curr_settings, dict(sort=params['sort']))
        else:
            curr_settings = {k: v for k, v in curr_settings.items() if k != 'sort'}
        data = filter_df_for_grid(data, params)
        if params.get('query') is not None:
            curr_settings = dict_merge(curr_settings, dict(query=params['query']))
        else:
            curr_settings = {k: v for k, v in curr_settings.items() if k != 'query'}
        SETTINGS[port] = curr_settings
        total = len(data)
        results = {}
        for sub_range in ids:
            sub_range = list(map(int, sub_range.split('-')))
            if len(sub_range) == 1:
                sub_df = data.iloc[sub_range[0]:sub_range[0] + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                results[sub_range[0]] = dict_merge({IDX_COL: sub_range[0]}, sub_df[0])
            else:
                [start, end] = sub_range
                sub_df = data.iloc[start:] if end >= len(data) - 1 else data.iloc[start:end + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                for i, d in zip(range(start, end + 1), sub_df):
                    results[i] = dict_merge({IDX_COL: i}, d)
        return_data = dict(results=results,
                           columns=[dict(name=IDX_COL, dtype='int64')] + DTYPES[port],
                           total=total)
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
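# A minimal sketch (sample data hypothetical) of the ids range handling above: each "START-END"
# entry becomes an inclusive iloc slice, with the tail of the frame returned when END runs past
# the last row.
import pandas as pd

df = pd.DataFrame({'a': range(100)})
ids = ['0-2', '97-120']  # ranges requested by the grid's scrollbar
for sub_range in ids:
    start, end = map(int, sub_range.split('-'))
    # clamp to the end of the frame when the requested range overshoots
    sub_df = df.iloc[start:] if end >= len(df) - 1 else df.iloc[start:end + 1]
    print(sub_range, len(sub_df))  # '0-2' -> 3 rows, '97-120' -> 3 rows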
def __init__(self, req):
    self.top = get_int_arg(req, "top")
    self.ordinal_col = get_str_arg(req, "ordinalCol")
    self.ordinal_agg = get_str_arg(req, "ordinalAgg", "sum")
    self.cleaners = get_str_arg(req, "cleaner")
def __init__(self, req):
    self.lat_col = get_str_arg(req, "latCol")
    self.lon_col = get_str_arg(req, "lonCol")