Example #1
def get_correlations_ts():
    """
    Flask route which returns timeseries of Pearson correlations of two columns with numeric data
    using :meth:`pandas:pandas.DataFrame.corr`

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :returns: JSON {
        data: {col1:col2: {data: [{corr: 0.99, date: 'YYYY-MM-DD'},...], max: 0.99, min: 0.99}}
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[get_port()]
        data = data.query(query) if query is not None else data
        cols = get_str_arg(request, 'cols')
        cols = cols.split(',')
        date_col = get_str_arg(request, 'dateCol')
        data = data.groupby(date_col)[list(set(cols))].corr(method='pearson')
        data.index.names = ['date', 'column']
        data = data.reset_index()
        data = data[data.column == cols[0]][['date', cols[1]]]
        data.columns = ['date', 'corr']
        data = {
            k: v
            for k, v in _build_timeseries_chart_data('corr', data, ['corr'])
        }
        return jsonify(dict(data=data))
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
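The heavy lifting in this route is a single pandas pattern: group by the date column, build a per-group correlation matrix, then slice out the one column pair being charted. A self-contained sketch of that pattern on invented toy data:

import pandas as pd

# Toy frame: two numeric columns observed on repeated dates.
df = pd.DataFrame({
    'date': pd.to_datetime(['2024-01-01'] * 3 + ['2024-01-02'] * 3),
    'a': [1.0, 2.0, 3.0, 1.0, 2.0, 4.0],
    'b': [2.0, 4.0, 6.0, 3.0, 2.0, 1.0],
})

# Per-date Pearson matrices stacked into a (date, column) MultiIndex.
corr = df.groupby('date')[['a', 'b']].corr(method='pearson')
corr.index.names = ['date', 'column']
corr = corr.reset_index()

# Keep the 'a' rows and read off their correlation with 'b',
# the same slicing get_correlations_ts performs before charting.
ts = corr[corr.column == 'a'][['date', 'b']]
ts.columns = ['date', 'corr']
print(ts)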
Example #2
def test_getters(builtin_pkg):
    req = build_req_tuple(
        {'int': '1', 'empty_int': '', 'str': 'hello', 'empty_str': '', 'bool': 'true', 'float': '1.1'}
    )
    val = utils.get_str_arg(req, 'str')
    assert isinstance(val, str) and val == 'hello'
    val = utils.get_str_arg(req, 'str_def', default='def')
    assert val == 'def'
    val = utils.get_str_arg(req, 'empty_str')
    assert val is None
    with mock.patch('{}.str'.format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_str_arg(req, 'str', default='def')
        assert val == 'def'
    val = utils.get_int_arg(req, 'int')
    assert isinstance(val, int) and val == 1
    val = utils.get_int_arg(req, 'int_def', default=2)
    assert val == 2
    val = utils.get_int_arg(req, 'empty_int')
    assert val is None
    with mock.patch('{}.int'.format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_int_arg(req, 'int', default=2)
        assert val == 2
    val = utils.get_bool_arg(req, 'bool')
    assert isinstance(val, bool) and val

    val = utils.get_float_arg(req, 'float')
    assert isinstance(val, float) and val == 1.1
    val = utils.get_float_arg(req, 'int_def', default=2.0)
    assert val == 2.0
    val = utils.get_float_arg(req, 'empty_float')
    assert val is None
    with mock.patch('{}.float'.format(builtin_pkg), mock.Mock(side_effect=Exception)):
        val = utils.get_float_arg(req, 'float', default=2.0)
        assert val == 2
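The assertions above pin down the getter contract: a missing or empty argument yields the default (None when unspecified), and any conversion failure also falls back to the default. A minimal sketch of getters satisfying that contract, assuming req exposes a dict-like args mapping; the shipped dtale.utils implementations may differ in detail:

def get_str_arg(req, name, default=None):
    # Missing and empty values both collapse to the default.
    val = req.args.get(name)
    if val is None or val == '':
        return default
    try:
        return str(val)
    except BaseException:
        # Matches the mock.patch tests above: conversion errors fall back too.
        return default


def get_int_arg(req, name, default=None):
    val = req.args.get(name)
    if val is None or val == '':
        return default
    try:
        return int(val)
    except BaseException:
        return default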
Example #3
def get_histogram(data_id):
    """
    :class:`flask:flask.Flask` route which returns output from numpy.histogram to front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param bins: the number of bins to display in your histogram, options on the front-end are 5, 10, 20, 50
    :returns: JSON {data: [histogram counts], labels: [bin edge labels], desc: output from pd.DataFrame[col].describe()} or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    col = get_str_arg(request, 'col', 'values')
    query = get_str_arg(request, 'query')
    bins = get_int_arg(request, 'bins', 20)
    try:
        data = DATA[data_id]
        if query:
            data = data.query(query)

        selected_col = find_selected_column(data, col)
        data = data[~pd.isnull(data[selected_col])][[selected_col]]
        hist = np.histogram(data, bins=bins)

        desc = load_describe(data[selected_col])
        return jsonify(data=[json_float(h) for h in hist[0]], labels=['{0:.1f}'.format(l) for l in hist[1]], desc=desc)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
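For reference, numpy.histogram returns a (counts, bin_edges) pair with one more edge than counts; the route serializes the counts as data and the edges, formatted to one decimal, as labels. In isolation:

import numpy as np

values = np.random.default_rng(0).normal(size=1000)
counts, edges = np.histogram(values, bins=20)

# 20 counts, 21 edges; the route formats the edges as chart labels.
labels = ['{0:.1f}'.format(e) for e in edges]
print(len(counts), len(labels))  # 20 21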
Example #4
def __init__(self, req):
    self.category_col = get_str_arg(req, "categoryCol")
    self.category_agg = get_str_arg(req, "categoryAgg", "mean")
    self.aggs = [
        "count",
        "sum" if self.category_agg == "pctsum" else self.category_agg,
    ]
    self.top = get_int_arg(req, "top")
Example #5
def get_scatter():
    """
    Flask route which returns data used in correlation of two columns for scatter chart

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :param date: string from flask.request.args['date'] date value in dateCol to filter dataframe to
    :returns: JSON {
        data: [{col1: 0.123, col2: 0.123, index: 1},...,{col1: 0.123, col2: 0.123, index: N}],
        stats: {
            correlated: 50,
            only_in_s0: 1,
            only_in_s1: 2,
            pearson: 0.987,
            spearman: 0.879,
        }
        x: col1,
        y: col2
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    cols = get_str_arg(request, 'cols')
    cols = cols.split(',')
    query = get_str_arg(request, 'query')
    date = get_str_arg(request, 'date')
    date_col = get_str_arg(request, 'dateCol')
    try:
        data = DATA[get_port()]
        data = data[data[date_col] == date] if date else data
        if query:
            data = data.query(query)

        data = data[list(set(cols))].dropna(how='any')
        data[str('index')] = data.index
        s0 = data[cols[0]]
        s1 = data[cols[1]]
        pearson = s0.corr(s1, method='pearson')
        spearman = s0.corr(s1, method='spearman')
        stats = dict(pearson='N/A' if pd.isnull(pearson) else pearson,
                     spearman='N/A' if pd.isnull(spearman) else spearman,
                     correlated=len(data),
                     only_in_s0=len(data[data[cols[0]].isnull()]),
                     only_in_s1=len(data[data[cols[1]].isnull()]))

        if len(data) > 15000:
            return jsonify(
                stats=stats,
                error='Dataset exceeds 15,000 records, cannot render scatter. Please apply filter...'
            )
        f = grid_formatter(grid_columns(data))
        data = f.format_dicts(data.itertuples())
        return jsonify(data=data, x=cols[0], y=cols[1], stats=stats)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
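Both statistics come from pandas.Series.corr, which takes the correlation method by name; a quick standalone check:

import pandas as pd

s0 = pd.Series([1.0, 2.0, 3.0, 4.0])
s1 = pd.Series([1.1, 1.9, 3.2, 3.8])

# Pearson measures linear association; Spearman is rank-based.
print(s0.corr(s1, method='pearson'))
print(s0.corr(s1, method='spearman'))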
Example #6
def get_chart_data(data_id):
    """
    :class:`flask:flask.Flask` route which builds data associated with a chart.js chart

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param x: string from flask.request.args['x'] column to be used as x-axis of chart
    :param y: string from flask.request.args['y'] column to be used as y-axis of chart
    :param group: string from flask.request.args['group'] comma-separated string of columns to group chart data by
    :param agg: string from flask.request.args['agg'] points to a specific function that can be applied to
                :func:`pandas.core.groupby.DataFrameGroupBy`.  Possible values are: count, first, last, mean,
                median, min, max, std, var, mad, prod, sum
    :returns: JSON {
        data: {
            series1: { x: [x1, x2, ..., xN], y: [y1, y2, ..., yN] },
            series2: { x: [x1, x2, ..., xN], y: [y1, y2, ..., yN] },
            ...,
            seriesN: { x: [x1, x2, ..., xN], y: [y1, y2, ..., yN] },
        },
        min: minY,
        max: maxY,
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        if query:
            try:
                data = data.query(query)
            except BaseException as e:
                return jsonify(dict(error='Invalid query: {}'.format(str(e))))
            if not len(data):
                return jsonify(
                    dict(error='query "{}" found no data, please alter'.format(
                        query)))
        x = get_str_arg(request, 'x')
        y = get_json_arg(request, 'y')
        group_col = get_json_arg(request, 'group')
        agg = get_str_arg(request, 'agg')
        allow_duplicates = get_bool_arg(request, 'allowDupes')
        window = get_int_arg(request, 'rollingWin')
        comp = get_str_arg(request, 'rollingComp')
        data = build_chart(data,
                           x,
                           y,
                           group_col,
                           agg,
                           allow_duplicates,
                           rolling_win=window,
                           rolling_comp=comp)
        data['success'] = True
        return jsonify(data)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #7
def get_port():
    """
    Helper function to grab port information (SERVER_PORT) from flask.request.environ
    """
    return get_str_arg(request, 'port',
                       request.environ.get('SERVER_PORT', 'curr'))
Example #8
def get_correlations(data_id):
    """
    :class:`flask:flask.Flask` route which gathers Pearson correlations against all combinations of columns with
    numeric data using :meth:`pandas:pandas.DataFrame.corr`

    On large datasets with no :attr:`numpy:numpy.nan` data this code will use :meth:`numpy:numpy.corrcoef`
    for speed purposes

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :returns: JSON {
        data: [{column: col1, col1: 1.0, col2: 0.99, colN: 0.45},...,{column: colN, col1: 0.34, col2: 0.88, colN: 1.0}],
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        data = data.query(query) if query is not None else data

        valid_corr_cols = []
        valid_date_cols = []
        rolling = False
        for col_info in DTYPES[data_id]:
            name, dtype = map(col_info.get, ['name', 'dtype'])
            dtype = classify_type(dtype)
            if dtype in ['I', 'F']:
                valid_corr_cols.append(name)
            elif dtype == 'D':
                # even if a datetime column exists, we need to make sure that there is enough data for a date
                # to warrant a correlation, https://github.com/man-group/dtale/issues/43
                date_counts = data[name].dropna().value_counts()
                if len(date_counts[date_counts > 1]) > 1:
                    valid_date_cols.append(name)
                elif date_counts.eq(1).all():
                    valid_date_cols.append(name)
                    rolling = True

        if data[valid_corr_cols].isnull().values.any():
            data = data.corr(method='pearson')
        else:
            # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
            # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
            data = np.corrcoef(data[valid_corr_cols].values, rowvar=False)
            data = pd.DataFrame(data,
                                columns=valid_corr_cols,
                                index=valid_corr_cols)

        data.index.name = str('column')
        data = data.reset_index()
        col_types = grid_columns(data)
        f = grid_formatter(col_types, nan_display=None)
        return jsonify(data=f.format_dicts(data.itertuples()),
                       dates=valid_date_cols,
                       rolling=rolling)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
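The fast path the comment links to is easy to verify on its own: with no NaNs present, numpy.corrcoef over the raw values matches pandas.DataFrame.corr, and rewrapping the result in a DataFrame keeps the downstream formatting code unchanged. A small demonstration with random data:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.default_rng(1).normal(size=(1000, 4)),
                  columns=list('abcd'))

# rowvar=False treats columns (not rows) as the variables.
fast = pd.DataFrame(np.corrcoef(df.values, rowvar=False),
                    columns=df.columns, index=df.columns)
slow = df.corr(method='pearson')
assert np.allclose(fast.values, slow.values)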
Example #9
    def __init__(self, data_id, req):
        self.data_id = data_id
        self.analysis_type = get_str_arg(req, "type")
        curr_settings = global_state.get_settings(data_id) or {}
        self.query = build_query(data_id, curr_settings.get("query"))
        data = load_filterable_data(data_id, req, query=self.query)
        self.selected_col = find_selected_column(
            data, get_str_arg(req, "col", "values")
        )
        self.data = data[~pd.isnull(data[self.selected_col])]
        self.dtype = find_dtype(self.data[self.selected_col])
        self.classifier = classify_type(self.dtype)
        self.code = build_code_export(
            data_id,
            imports="{}\n".format(
                "\n".join(
                    [
                        "import numpy as np",
                        "import pandas as pd",
                        "import plotly.graph_objs as go",
                    ]
                )
            ),
        )

        if self.analysis_type is None:
            self.analysis_type = (
                "histogram" if self.classifier in ["F", "I", "D"] else "value_counts"
            )

        if self.analysis_type == "geolocation":
            self.analysis = GeolocationAnalysis(req)
        elif self.analysis_type == "histogram":
            self.analysis = HistogramAnalysis(req)
        elif self.analysis_type == "categories":
            self.analysis = CategoryAnalysis(req)
        elif self.analysis_type == "value_counts":
            self.analysis = ValueCountAnalysis(req)
        elif self.analysis_type == "word_value_counts":
            self.analysis = WordValueCountAnalysis(req)
        elif self.analysis_type == "qq":
            self.analysis = QQAnalysis()
Example #10
def test_getters(builtin_pkg):
    req = build_req_tuple({
        "int": "1",
        "empty_int": "",
        "str": "hello",
        "empty_str": "",
        "bool": "true",
        "float": "1.1",
    })
    val = utils.get_str_arg(req, "str")
    assert isinstance(val, str) and val == "hello"
    val = utils.get_str_arg(req, "str_def", default="def")
    assert val == "def"
    val = utils.get_str_arg(req, "empty_str")
    assert val is None
    with mock.patch("{}.str".format(builtin_pkg),
                    mock.Mock(side_effect=Exception)):
        val = utils.get_str_arg(req, "str", default="def")
        assert val == "def"
    val = utils.get_int_arg(req, "int")
    assert isinstance(val, int) and val == 1
    val = utils.get_int_arg(req, "int_def", default=2)
    assert val == 2
    val = utils.get_int_arg(req, "empty_int")
    assert val is None
    with mock.patch("{}.int".format(builtin_pkg),
                    mock.Mock(side_effect=Exception)):
        val = utils.get_int_arg(req, "int", default=2)
        assert val == 2
    val = utils.get_bool_arg(req, "bool")
    assert isinstance(val, bool) and val

    val = utils.get_float_arg(req, "float")
    assert isinstance(val, float) and val == 1.1
    val = utils.get_float_arg(req, "int_def", default=2.0)
    assert val == 2.0
    val = utils.get_float_arg(req, "empty_float")
    assert val is None
    with mock.patch("{}.float".format(builtin_pkg),
                    mock.Mock(side_effect=Exception)):
        val = utils.get_float_arg(req, "float", default=2.0)
        assert val == 2
Example #11
def get_correlations_ts(data_id):
    """
    :class:`flask:flask.Flask` route which returns timeseries of Pearson correlations of two columns with numeric data
    using :meth:`pandas:pandas.DataFrame.corr`

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :returns: JSON {
        data: {col1:col2: {data: [{corr: 0.99, date: 'YYYY-MM-DD'},...], max: 0.99, min: 0.99}}
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        data = data.query(query) if query is not None else data
        cols = get_str_arg(request, 'cols')
        cols = json.loads(cols)
        date_col = get_str_arg(request, 'dateCol')
        rolling_window = get_int_arg(request, 'rollingWindow')
        if rolling_window:
            [col1, col2] = list(set(cols))
            data = data[[date_col, col1, col2]].set_index(date_col)
            data = data[[col1,
                         col2]].rolling(rolling_window).corr().reset_index()
            data = data.dropna()
            data = data[data['level_1'] == col1][[date_col, col2]]
        else:
            data = data.groupby(date_col)[list(
                set(cols))].corr(method='pearson')
            data.index.names = ['date', 'column']
            data = data.reset_index()
            data = data[data.column == cols[0]][['date', cols[1]]]
        data.columns = ['date', 'corr']
        return_data = build_chart(data.fillna(0), 'date', 'corr')
        return_data['success'] = True
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
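The rollingWindow branch leans on the shape pandas gives back from a pairwise rolling correlation: a two-level MultiIndex of (date, column), from which the route keeps only the rows labeled with the first column. The same move on toy data; note that after reset_index the unnamed levels surface as level_0/level_1:

import numpy as np
import pandas as pd

idx = pd.date_range('2024-01-01', periods=10)
df = pd.DataFrame({'a': np.arange(10.0), 'b': np.arange(10.0) ** 2}, index=idx)

# Pairwise rolling corr stacks a (date, column) MultiIndex; dropna
# discards the first windows that are shorter than the window size.
roll = df[['a', 'b']].rolling(3).corr().reset_index().dropna()
ts = roll[roll['level_1'] == 'a'][['level_0', 'b']]
ts.columns = ['date', 'corr']
print(ts)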
Example #12
def test_filter():
    """
    Flask route which will test out pandas query before it gets applied to DATA and return exception information to the
    screen if there is any

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :return: JSON {success: True/False}
    """
    try:
        query = get_str_arg(request, 'query')
        _test_filter(DATA[get_port()], query)
        return jsonify(dict(success=True))
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #13
def get_correlations():
    """
    Flask route which gathers Pearson correlations against all combinations of columns with numeric data
    using :meth:`pandas:pandas.DataFrame.corr`

    On large datasets with no :attr:`numpy:numpy.nan` data this code will use :meth:`numpy:numpy.corrcoef`
    for speed purposes

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :returns: JSON {
        data: [{column: col1, col1: 1.0, col2: 0.99, colN: 0.45},...,{column: colN, col1: 0.34, col2: 0.88, colN: 1.0}],
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        port = get_port()
        data = DATA[port]
        data = data.query(query) if query is not None else data

        valid_corr_cols = []
        valid_date_cols = []
        for col_info in DTYPES[port]:
            name, dtype = map(col_info.get, ['name', 'dtype'])
            dtype = classify_type(dtype)
            if dtype in ['I', 'F']:
                valid_corr_cols.append(name)
            elif dtype == 'D' and len(data[name].dropna().unique()) > 1:
                valid_date_cols.append(name)

        if data[valid_corr_cols].isnull().values.any():
            data = data.corr(method='pearson')
        else:
            # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
            # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
            data = np.corrcoef(data[valid_corr_cols].values, rowvar=False)
            data = pd.DataFrame(data,
                                columns=valid_corr_cols,
                                index=valid_corr_cols)

        data.index.name = str('column')
        data = data.reset_index()
        col_types = grid_columns(data)
        f = grid_formatter(col_types, nan_display=None)
        return jsonify(data=f.format_dicts(data.itertuples()),
                       dates=valid_date_cols)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #14
def test_filter(data_id):
    """
    :class:`flask:flask.Flask` route which will test out pandas query before it gets applied to DATA and return
    exception information to the screen if there is any

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :return: JSON {success: True/False}
    """
    try:
        query = get_str_arg(request, 'query')
        _test_filter(DATA[data_id], query)
        return jsonify(dict(success=True))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
Example #15
def filter_data(df, req, groups, query=None):
    filters = get_str_arg(req, 'filters')
    if not filters:
        return df.query(query or 'index == index'), groups, ''
    filters = json.loads(filters)
    col, prev_freq, freq, end = map(filters[-1].get,
                                    ['name', 'prevFreq', 'freq', 'date'])
    start = DATE_RANGES[prev_freq](pd.Timestamp(end)).strftime('%Y%m%d')
    range_query = "{col} >= '{start}' and {col} <= '{end}'".format(
        col=col, start=start, end=end)
    logger.info('filtered coverage data to slice: {}'.format(range_query))
    updated_groups = [
        dict(name=col, freq=freq) if g['name'] == col else g
        for g in groups
    ]
    return df.query(
        query or
        'index == index').query(range_query), updated_groups, range_query
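The range_query it assembles is an ordinary DataFrame.query expression comparing a datetime column against date strings, which pandas coerces for the comparison; the 'index == index' fallback is a no-op filter so the code can unconditionally call query(). A toy illustration (DATE_RANGES, which supplies the window start per frequency, is the app's own lookup):

import pandas as pd

df = pd.DataFrame({'date': pd.date_range('2024-01-01', periods=10),
                   'val': range(10)})

range_query = "date >= '2024-01-03' and date <= '2024-01-06'"
print(df.query(range_query))  # rows for Jan 3 through Jan 6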
Example #16
def update_settings():
    """
    Flask route which updates global SETTINGS for current port

    :param port: number string from flask.request.environ['SERVER_PORT']
    :param settings: JSON string from flask.request.args['settings'] which gets decoded and stored in SETTINGS variable
    :return: JSON
    """
    try:
        global SETTINGS

        port = get_port()
        curr_settings = SETTINGS.get(port, {})
        updated_settings = dict_merge(
            curr_settings, json.loads(get_str_arg(request, 'settings', '{}')))
        SETTINGS[port] = updated_settings
        return jsonify(dict(success=True))
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
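All this route needs from dict_merge is a right-biased shallow merge: keys in the freshly decoded settings override the saved ones. Assuming that semantic (the helper itself is not shown on this page), the operation reduces to:

import json

def dict_merge(d1, d2):
    # Assumed behavior: shallow merge, right side wins.
    out = dict(d1 or {})
    out.update(d2 or {})
    return out

curr_settings = {'sort': [['a', 'ASC']]}
print(dict_merge(curr_settings, json.loads('{"query": "a > 1"}')))
# {'sort': [['a', 'ASC']], 'query': 'a > 1'}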
Example #17
def get_correlations():
    """
    Flask route which gathers Pearson correlations against all combinations of columns with numeric data
    using pandas.DataFrame.corr

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :returns: JSON {
        data: [{column: col1, col1: 1.0, col2: 0.99, colN: 0.45},...,{column: colN, col1: 0.34, col2: 0.88, colN: 1.0}],
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA.query(query) if query is not None else DATA
        data = data.corr(method='pearson')
        data.index.name = 'column'
        data = data.reset_index()
        col_types = grid_columns(data)
        f = grid_formatter(col_types, nan_display=None)
        return jsonify(data=f.format_dicts(data.itertuples()))
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #18
def get_data():
    """
    Flask route which returns current rows from DATA (based on scrollbar specs and saved settings) to front-end as
    JSON

    :param ids: required dash-separated string "START-END" stating a range of row indexes to be returned to the screen
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param sort: JSON string from flask.request.args['sort'] which is applied to DATA using the sort_values() or
                 sort_index() function.  Here is the JSON structure: [[col1,dir1],[col2,dir2],...,[coln,dirn]]
    :param port: number string from flask.request.environ['SERVER_PORT'] for retrieving saved settings
    :return: JSON {
        results: [
            {dtale_index: 1, col1: val1_1, ...,colN: valN_1},
            ...,
            {dtale_index: N2, col1: val1_N2, ...,colN: valN_N2}
        ],
        columns: [{name: col1, dtype: 'int64'},...,{name: colN, dtype: 'datetime'}],
        total: N2,
        success: True/False
    }
    """
    try:
        global SETTINGS, DATA

        params = retrieve_grid_params(request)
        ids = get_str_arg(request, 'ids')
        if ids:
            ids = json.loads(ids)
        else:
            return jsonify({})
        col_types = grid_columns(DATA)

        f = grid_formatter(col_types)
        curr_settings = SETTINGS.get(
            request.environ.get('SERVER_PORT', 'curr'), {})
        if curr_settings.get('sort') != params.get('sort'):
            DATA = sort_df_for_grid(DATA, params)
        df = DATA
        if params.get('sort') is not None:
            curr_settings = dict_merge(curr_settings,
                                       dict(sort=params['sort']))
        else:
            curr_settings = {
                k: v
                for k, v in curr_settings.items() if k != 'sort'
            }
        df = filter_df_for_grid(df, params)
        if params.get('query') is not None:
            curr_settings = dict_merge(curr_settings,
                                       dict(query=params['query']))
        else:
            curr_settings = {
                k: v
                for k, v in curr_settings.items() if k != 'query'
            }
        SETTINGS[request.environ.get('SERVER_PORT', 'curr')] = curr_settings

        total = len(df)
        results = {}
        for sub_range in ids:
            sub_range = list(map(int, sub_range.split('-')))
            if len(sub_range) == 1:
                sub_df = df.iloc[sub_range[0]:sub_range[0] + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                results[sub_range[0]] = dict_merge(
                    dict(dtale_index=sub_range[0]), sub_df[0])
            else:
                [start, end] = sub_range
                sub_df = df.iloc[start:] if end >= len(
                    df) - 1 else df.iloc[start:end + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                for i, d in zip(range(start, end + 1), sub_df):
                    results[i] = dict_merge(dict(dtale_index=i), d)
        return_data = dict(results=results,
                           columns=[dict(name='dtale_index', dtype='int64')] +
                           col_types,
                           total=total)
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
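The ids parameter is what keeps this route scroll-friendly: each entry is either a single row index or a 'START-END' range, resolved with iloc so only the visible rows get formatted. A stripped-down reconstruction of that loop on toy data:

import pandas as pd

df = pd.DataFrame({'a': range(100)})
ids = ['0-2', '97-99']  # dash-separated row ranges, as the route receives them

results = {}
for sub_range in ids:
    bounds = list(map(int, sub_range.split('-')))
    start = bounds[0]
    end = bounds[0] if len(bounds) == 1 else bounds[1]
    rows = df.iloc[start:end + 1]
    for i, (_, row) in enumerate(rows.iterrows(), start=start):
        results[i] = dict(dtale_index=i, **row.to_dict())
print(sorted(results))  # [0, 1, 2, 97, 98, 99]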
Example #19
def get_scatter(data_id):
    """
    :class:`flask:flask.Flask` route which returns data used in correlation of two columns for scatter chart

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param cols: comma-separated string from flask.request.args['cols'] containing names of two columns in dataframe
    :param dateCol: string from flask.request.args['dateCol'] with name of date-type column in dataframe for timeseries
    :param date: string from flask.request.args['date'] date value in dateCol to filter dataframe to
    :returns: JSON {
        data: [{col1: 0.123, col2: 0.123, index: 1},...,{col1: 0.123, col2: 0.123, index: N}],
        stats: {
            correlated: 50,
            only_in_s0: 1,
            only_in_s1: 2,
            pearson: 0.987,
            spearman: 0.879,
        }
        x: col1,
        y: col2
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    cols = get_json_arg(request, 'cols')
    query = get_str_arg(request, 'query')
    date = get_str_arg(request, 'date')
    date_col = get_str_arg(request, 'dateCol')
    rolling = get_bool_arg(request, 'rolling')
    try:
        data = DATA[data_id]
        if query:
            data = data.query(query)

        idx_col = str('index')
        y_cols = [cols[1], idx_col]
        if rolling:
            window = get_int_arg(request, 'window')
            idx = min(data[data[date_col] == date].index) + 1
            data = data.iloc[max(idx - window, 0):idx]
            data = data[list(set(cols)) + [date_col]].dropna(how='any')
            y_cols.append(date_col)
        else:
            data = data[data[date_col] == date] if date else data
            data = data[list(set(cols))].dropna(how='any')

        data[idx_col] = data.index
        s0 = data[cols[0]]
        s1 = data[cols[1]]
        pearson = s0.corr(s1, method='pearson')
        spearman = s0.corr(s1, method='spearman')
        stats = dict(
            pearson='N/A' if pd.isnull(pearson) else pearson,
            spearman='N/A' if pd.isnull(spearman) else spearman,
            correlated=len(data),
            only_in_s0=len(data[data[cols[0]].isnull()]),
            only_in_s1=len(data[data[cols[1]].isnull()])
        )

        if len(data) > 15000:
            return jsonify(
                stats=stats,
                error='Dataset exceeds 15,000 records, cannot render scatter. Please apply filter...'
            )
        data = build_chart(data, cols[0], y_cols, allow_duplicates=True)
        data['x'] = cols[0]
        data['y'] = cols[1]
        data['stats'] = stats
        return jsonify(data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
Example #20
def find_coverage():
    """
    Flask route which returns coverage information (counts) for a column grouped by other column(s)

    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param col: string from flask.request.args['col'] containing name of a column in your dataframe
    :param filters (deprecated): JSON string from flask.request.args['filters'] with filtering information from group
           drilldown [
        {name: col1, prevFreq: Y, freq: Q, date: YYYY-MM-DD},
        ...
        {name: col1, prevFreq: D, freq: W, date: YYYY-MM-DD},
    ]
    :param group: JSON string from flask.request.args['group'] containing grouping logic in this structure [
        {name: col1} or {name: date_col1, freq: [D,W,M,Q,Y]}
    ]
    :returns: JSON {
        data: {[col]: [count1,count2,...,countN]},
        labels: [{group_col1: gc1_v1, group_col2: gc2_v1},...,{group_col1: gc1_vN, group_col2: gc2_vN}],
        success: True
    } or {error: 'Exception message', traceback: 'Exception stacktrace', success: False}
    """
    def filter_data(df, req, groups, query=None):
        filters = get_str_arg(req, 'filters')
        if not filters:
            return df.query(query or 'index == index'), groups, ''
        filters = json.loads(filters)
        col, prev_freq, freq, end = map(filters[-1].get,
                                        ['name', 'prevFreq', 'freq', 'date'])
        start = DATE_RANGES[prev_freq](pd.Timestamp(end)).strftime('%Y%m%d')
        range_query = "{col} >= '{start}' and {col} <= '{end}'".format(
            col=col, start=start, end=end)
        logger.info('filtered coverage data to slice: {}'.format(range_query))
        updated_groups = [
            dict(name=col, freq=freq) if g['name'] == col else g
            for g in groups
        ]
        return df.query(
            query or
            'index == index').query(range_query), updated_groups, range_query

    try:
        col = get_str_arg(request, 'col')
        groups = get_str_arg(request, 'group')
        if groups:
            groups = json.loads(groups)
        data = DATA[get_port()]
        data, groups, query = filter_data(data,
                                          request,
                                          groups,
                                          query=get_str_arg(request, 'query'))
        grouper = []
        for g_cfg in groups:
            if 'freq' in g_cfg:
                freq_grp = data.set_index([g_cfg['name']]).index.to_period(
                    g_cfg['freq']).to_timestamp(how='end')
                freq_grp.name = g_cfg['name']
                grouper.append(freq_grp)
            else:
                grouper.append(data[g_cfg['name']])

        data_groups = data.groupby(grouper)
        group_data = data_groups[col].count()
        if len(groups) > 1:
            unstack_order = enumerate(
                zip(group_data.index.names, group_data.index.levels))
            unstack_order = sorted([(uo[0], uo[1][0], len(uo[1][1]))
                                    for uo in unstack_order],
                                   key=lambda k: k[2])
            for i, n, l in unstack_order[:-1]:
                group_data = group_data.unstack(i)
            group_data = group_data.fillna(0)
            if len(unstack_order[:-1]) > 1:
                group_data.columns = [
                    ', '.join([
                        str(group_data.columns.levels[c2[0]][c2[1]])
                        for c2 in enumerate(c)
                    ]) for c in zip(*group_data.columns.labels)
                ]
            else:
                group_data.columns = map(str, group_data.columns.values)

        if len(group_data) > 15000:
            return jsonify(
                dict(error=(
                    'Your grouping created {} groups, chart will not render. '
                    'Try making date columns a higher frequency (W, M, Q, Y)'
                ).format(len(data_groups))))
        if len(groups) == 1:
            data = {col: [json_int(v) for v in group_data.values]}
        else:
            data = dict([(c, [json_int(v) for v in group_data[c].values])
                         for c in group_data.columns])
        labels = pd.DataFrame(group_data.index.values,
                              columns=group_data.index.names)
        labels_f_overrides = {
            'D': lambda f, i, c: f.add_date(i, c, fmt='%Y-%m-%d'),
        }
        labels_f = grid_formatter(grid_columns(labels),
                                  overrides=labels_f_overrides)
        labels = labels_f.format_dicts(labels.itertuples())
        return jsonify(data=data, labels=labels, success=True)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
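The multi-group branch boils down to a count per group combination, then unstacking all but the largest index level so each combination of the smaller groups becomes its own series. The core move in isolation:

import pandas as pd

df = pd.DataFrame({
    'region': ['east', 'east', 'west', 'west', 'west'],
    'year': [2023, 2024, 2023, 2023, 2024],
    'sales': [10, 20, 30, 40, 50],
})

# Count per (region, year), then pivot region into columns so each
# region becomes a series of per-year counts.
counts = df.groupby(['region', 'year'])['sales'].count()
print(counts.unstack(0).fillna(0))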
Example #21
def __init__(self, req):
    self.bins = get_int_arg(req, "bins", 20)
    self.target = get_str_arg(req, "target")
    self.density = get_bool_arg(req, "density")
Example #22
def __init__(self, req):
    self.bins = get_int_arg(req, "bins", 20)
    self.target = get_str_arg(req, "target")
Example #23
def get_data():
    """
    Flask route which returns current rows from DATA (based on scrollbar specs and saved settings) to front-end as
    JSON

    :param ids: required dash-separated string "START-END" stating a range of row indexes to be returned to the screen
    :param query: string from flask.request.args['query'] which is applied to DATA using the query() function
    :param sort: JSON string from flask.request.args['sort'] which is applied to DATA using the sort_values() or
                 sort_index() function.  Here is the JSON structure: [[col1,dir1],[col2,dir2],...,[coln,dirn]]
    :param port: number string from flask.request.environ['SERVER_PORT'] for retrieving saved settings
    :return: JSON {
        results: [
            {dtale_index: 1, col1: val1_1, ...,colN: valN_1},
            ...,
            {dtale_index: N2, col1: val1_N2, ...,colN: valN_N2}
        ],
        columns: [{name: col1, dtype: 'int64'},...,{name: colN, dtype: 'datetime'}],
        total: N2,
        success: True/False
    }
    """
    try:
        global SETTINGS, DATA, DTYPES
        port = get_port()
        data = DATA[port]

        # this checks for the case where someone instantiates D-Tale programmatically and directly alters the
        # internal state of the dataframe (EX: d.data['new_col'] = 'foo')
        curr_dtypes = [c['name'] for c in DTYPES[port]]
        if any(c not in curr_dtypes for c in data.columns):
            data, _ = format_data(data)
            DATA[port] = data
            DTYPES[port] = build_dtypes_state(data)

        params = retrieve_grid_params(request)
        ids = get_str_arg(request, 'ids')
        if ids:
            ids = json.loads(ids)
        else:
            return jsonify({})

        col_types = DTYPES[port]
        f = grid_formatter(col_types)
        curr_settings = SETTINGS.get(port, {})
        if curr_settings.get('sort') != params.get('sort'):
            data = sort_df_for_grid(data, params)
            DATA[port] = data
        if params.get('sort') is not None:
            curr_settings = dict_merge(curr_settings,
                                       dict(sort=params['sort']))
        else:
            curr_settings = {
                k: v
                for k, v in curr_settings.items() if k != 'sort'
            }
        data = filter_df_for_grid(data, params)
        if params.get('query') is not None:
            curr_settings = dict_merge(curr_settings,
                                       dict(query=params['query']))
        else:
            curr_settings = {
                k: v
                for k, v in curr_settings.items() if k != 'query'
            }
        SETTINGS[port] = curr_settings

        total = len(data)
        results = {}
        for sub_range in ids:
            sub_range = list(map(int, sub_range.split('-')))
            if len(sub_range) == 1:
                sub_df = data.iloc[sub_range[0]:sub_range[0] + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                results[sub_range[0]] = dict_merge({IDX_COL: sub_range[0]},
                                                   sub_df[0])
            else:
                [start, end] = sub_range
                sub_df = data.iloc[start:] if end >= len(
                    data) - 1 else data.iloc[start:end + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                for i, d in zip(range(start, end + 1), sub_df):
                    results[i] = dict_merge({IDX_COL: i}, d)
        return_data = dict(results=results,
                           columns=[dict(name=IDX_COL, dtype='int64')] +
                           DTYPES[port],
                           total=total)
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(
            dict(error=str(e), traceback=str(traceback.format_exc())))
Example #24
def __init__(self, req):
    self.top = get_int_arg(req, "top")
    self.ordinal_col = get_str_arg(req, "ordinalCol")
    self.ordinal_agg = get_str_arg(req, "ordinalAgg", "sum")
    self.cleaners = get_str_arg(req, "cleaner")
Example #25
def __init__(self, req):
    self.lat_col = get_str_arg(req, "latCol")
    self.lon_col = get_str_arg(req, "lonCol")