def test_select_dtypes_exclude_using_scalars(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.Categorical(list('abc')),
                    'g': pd.date_range('20130101', periods=3),
                    'h': pd.date_range('20130101', periods=3,
                                       tz='US/Eastern'),
                    'i': pd.date_range('20130101', periods=3, tz='CET'),
                    'j': pd.period_range('2013-01', periods=3, freq='M'),
                    'k': pd.timedelta_range('1 day', periods=3)})

    ri = df.select_dtypes(exclude=np.number)
    ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(exclude='category')
    ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
    assert_frame_equal(ri, ei)

    pytest.raises(NotImplementedError,
                  lambda: df.select_dtypes(exclude='period'))
def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.Categorical(list('abc')),
                    'g': pd.date_range('20130101', periods=3),
                    'h': pd.date_range('20130101', periods=3,
                                       tz='US/Eastern'),
                    'i': pd.date_range('20130101', periods=3, tz='CET'),
                    'j': pd.period_range('2013-01', periods=3, freq='M'),
                    'k': pd.timedelta_range('1 day', periods=3)})

    ri = df.select_dtypes(include=np.number,
                          exclude=['floating', 'timedelta'])
    ei = df[['b', 'c']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=[np.number, 'category'],
                          exclude='floating')
    ei = df[['b', 'c', 'f', 'k']]
    assert_frame_equal(ri, ei)
def test_select_dtypes_bad_arg_raises(self):
    df = DataFrame({'a': list('abc'),
                    'g': list(u('abc')),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.date_range('now', periods=3).values})

    with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'):
        df.select_dtypes(['blargy, blarg, blarg'])
def test_select_dtypes_bad_datetime64(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.date_range('now', periods=3).values})

    with tm.assert_raises_regex(ValueError, '.+ is too specific'):
        df.select_dtypes(include=['datetime64[D]'])

    with tm.assert_raises_regex(ValueError, '.+ is too specific'):
        df.select_dtypes(exclude=['datetime64[as]'])
def test_select_dtypes_str_raises(self, dtype, arg):
    df = DataFrame({"a": list("abc"),
                    "g": list(u("abc")),
                    "b": list(range(1, 4)),
                    "c": np.arange(3, 6).astype("u1"),
                    "d": np.arange(4.0, 7.0, dtype="float64"),
                    "e": [True, False, True],
                    "f": pd.date_range("now", periods=3).values})
    msg = "string dtypes are not allowed"
    kwargs = {arg: [dtype]}

    with tm.assert_raises_regex(TypeError, msg):
        df.select_dtypes(**kwargs)
def test_select_dtypes_include(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.Categorical(list('abc'))})

    ri = df.select_dtypes(include=[np.number])
    ei = df[['b', 'c', 'd']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=[np.number, 'category'])
    ei = df[['b', 'c', 'd', 'f']]
    assert_frame_equal(ri, ei)
def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.date_range('now', periods=3).values})
    df['g'] = df.f.diff()

    assert not hasattr(np, 'u8')

    r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
    e = df[['a', 'b']]
    assert_frame_equal(r, e)

    r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
    e = df[['a', 'b', 'g']]
    assert_frame_equal(r, e)
def test_select_dtypes_exclude_using_list_like(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True]})
    re = df.select_dtypes(exclude=[np.number])
    ee = df[['a', 'e']]
    assert_frame_equal(re, ee)
def test_select_dtypes_exclude_include_using_list_like(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.date_range('now', periods=3).values})
    exclude = np.datetime64,
    include = np.bool_, 'integer'
    r = df.select_dtypes(include=include, exclude=exclude)
    e = df[['b', 'c', 'e']]
    assert_frame_equal(r, e)

    exclude = 'datetime',
    include = 'bool', 'int64', 'int32'
    r = df.select_dtypes(include=include, exclude=exclude)
    e = df[['b', 'e']]
    assert_frame_equal(r, e)
def test_select_dtypes_str_raises(self):
    df = DataFrame({'a': list('abc'),
                    'g': list(u('abc')),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.date_range('now', periods=3).values})
    string_dtypes = set((str, 'str', np.string_, 'S1',
                         'unicode', np.unicode_, 'U1'))
    try:
        string_dtypes.add(unicode)  # py2 only; raises NameError on py3
    except NameError:
        pass

    for dt in string_dtypes:
        with tm.assert_raises_regex(TypeError,
                                    'string dtypes are not allowed'):
            df.select_dtypes(include=[dt])
        with tm.assert_raises_regex(TypeError,
                                    'string dtypes are not allowed'):
            df.select_dtypes(exclude=[dt])
def test_select_dtypes_raises_on_string(self):
    df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})

    with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'):
        df.select_dtypes(include='object')
    with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'):
        df.select_dtypes(exclude='object')
    with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'):
        df.select_dtypes(include=int, exclude='object')
def deserialize(self, item, force_bytes_to_unicode=False):
    index = self._index_from_records(item)
    column_fields = [x for x in item.dtype.names
                     if x not in item.dtype.metadata['index']]
    multi_column = item.dtype.metadata.get('multi_column')
    if len(item) == 0:
        rdata = item[column_fields] if len(column_fields) > 0 else None
        if multi_column is not None:
            columns = MultiIndex.from_arrays(multi_column["values"],
                                             names=multi_column["names"])
            return DataFrame(rdata, index=index, columns=columns)
        else:
            return DataFrame(rdata, index=index)

    columns = item.dtype.metadata['columns']
    df = DataFrame(data=item[column_fields], index=index, columns=columns)

    if multi_column is not None:
        df.columns = MultiIndex.from_arrays(multi_column["values"],
                                            names=multi_column["names"])

    if force_bytes_to_unicode:
        # This is needed because a 'str' written in py2 is read back in py3
        # as 'bytes', which breaks the workflow of people migrating to py3.
        # https://github.com/manahl/arctic/issues/598
        # This should not be used in a normal flow; instead, write unicode
        # strings if you want to work with str in py3.
        for c in df.select_dtypes(object):
            # The conversion does not use astype (as the index conversion
            # does) because pandas has a bug where it converts the data
            # columns with a plain unicode-string conversion, and the objects
            # here are bytes, e.g. b'abc', which would become u"b'abc'", i.e.
            # including the b character as well! This generally happens when
            # there is a str conversion without specifying the encoding, e.g.
            # str(b'abc') -> "b'abc'"; the fix is to give it the encoding to
            # use, i.e. str(b'abc', 'utf-8') -> "abc".
            if type(df[c].iloc[0]) == bytes:
                df[c] = df[c].str.decode('utf-8')

        if isinstance(df.index, MultiIndex):
            unicode_indexes = []
            # A MultiIndex requires a conversion at each level.
            for level in range(len(df.index.levels)):
                _index = df.index.get_level_values(level)
                if isinstance(_index[0], bytes):
                    _index = _index.astype('unicode')
                unicode_indexes.append(_index)
            df.index = unicode_indexes
        else:
            if type(df.index[0]) == bytes:
                df.index = df.index.astype('unicode')

        if type(df.columns[0]) == bytes:
            df.columns = df.columns.astype('unicode')

    return df
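# A minimal illustration of the py2/py3 pitfall described in the comments
# above: str() on bytes without an encoding embeds the bytes repr in the
# result, while passing the encoding (or calling .decode) recovers the text.
raw = b'abc'
print(str(raw))             # "b'abc'" -- includes the b prefix and quotes
print(str(raw, 'utf-8'))    # "abc"
print(raw.decode('utf-8'))  # "abc" -- the idiomatic spelling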
def test_select_dtypes_include(self):
    df = DataFrame({'a': list('abc'),
                    'b': list(range(1, 4)),
                    'c': np.arange(3, 6).astype('u1'),
                    'd': np.arange(4.0, 7.0, dtype='float64'),
                    'e': [True, False, True],
                    'f': pd.Categorical(list('abc')),
                    'g': pd.date_range('20130101', periods=3),
                    'h': pd.date_range('20130101', periods=3,
                                       tz='US/Eastern'),
                    'i': pd.date_range('20130101', periods=3, tz='CET'),
                    'j': pd.period_range('2013-01', periods=3, freq='M'),
                    'k': pd.timedelta_range('1 day', periods=3)})

    ri = df.select_dtypes(include=[np.number])
    ei = df[['b', 'c', 'd', 'k']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
    ei = df[['b', 'c', 'd']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=[np.number, 'category'],
                          exclude=['timedelta'])
    ei = df[['b', 'c', 'd', 'f']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=['datetime'])
    ei = df[['g']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=['datetime64'])
    ei = df[['g']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=['datetimetz'])
    ei = df[['h', 'i']]
    assert_frame_equal(ri, ei)

    ri = df.select_dtypes(include=['timedelta'])
    ei = df[['k']]
    assert_frame_equal(ri, ei)

    self.assertRaises(NotImplementedError,
                      lambda: df.select_dtypes(include=['period']))
def test_select_dtypes_duplicate_columns(self):
    # GH20839
    odict = compat.OrderedDict
    df = DataFrame(odict([('a', list('abc')),
                          ('b', list(range(1, 4))),
                          ('c', np.arange(3, 6).astype('u1')),
                          ('d', np.arange(4.0, 7.0, dtype='float64')),
                          ('e', [True, False, True]),
                          ('f', pd.date_range('now', periods=3).values)]))
    df.columns = ['a', 'a', 'b', 'b', 'b', 'c']

    expected = DataFrame({'a': list(range(1, 4)),
                          'b': np.arange(3, 6).astype('u1')})

    result = df.select_dtypes(include=[np.number], exclude=['floating'])
    assert_frame_equal(result, expected)
def remove_discrete_variables_with_too_many_states(df: pd.DataFrame,
                                                   num_states: int = 30):
    # Flag object (discrete) columns whose number of unique states reaches
    # the threshold, then keep every other column in the original order.
    too_many_states = (df.select_dtypes(include=['object'])
                         .apply(lambda x: x.nunique() >= num_states))
    cols_to_drop = set(too_many_states[too_many_states].index)
    cols = [c for c in df.columns if c not in cols_to_drop]
    return df[cols]
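# Hedged usage sketch for the helper above; the column names and values are
# made up for illustration. The 'id' column has 50 unique object states, so
# it is removed; 'color' (2 states) and the numeric 'value' column survive.
demo = pd.DataFrame({
    'id': ['user_%d' % i for i in range(50)],  # 50 states -> dropped
    'color': ['red', 'blue'] * 25,             # 2 states -> kept
    'value': range(50),                        # numeric -> never checked
})
reduced = remove_discrete_variables_with_too_many_states(demo, num_states=30)
print(sorted(reduced.columns))  # ['color', 'value']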
def test_select_dtypes_empty(self):
    df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
    msg = 'at least one of include or exclude must be nonempty'
    with pytest.raises(ValueError, match=msg):
        df.select_dtypes()
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns)
                                 - set(utility_columns))
        target_names = None

    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        # plot confusion matrix
        conf_matrix = metrics.confusion_matrix(reference_data[target_column],
                                               reference_data[prediction_column])
        z = conf_matrix.astype(int)
        labels = target_names if target_names else sorted(set(reference_data[target_column]))

        # change each element of z to type string for annotations
        z_text = [[str(y) for y in x] for x in z]

        fig = ff.create_annotated_heatmap(z, x=labels, y=labels,
                                          annotation_text=z_text,
                                          colorscale='bluered', showscale=True)
        fig.update_layout(xaxis_title="Predicted value",
                          yaxis_title="Actual value")

        conf_matrix_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1 if production_data is not None else 2,
            params={
                "data": conf_matrix_json['data'],
                "layout": conf_matrix_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns)
                                 - set(utility_columns))

    if prediction_column is not None:
        # calculate output drift
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        #ref_feature_vc = reference_data[prediction_column][np.isfinite(reference_data[prediction_column])].value_counts()
        #prod_feature_vc = production_data[prediction_column][np.isfinite(production_data[prediction_column])].value_counts()
        #keys = set(list(reference_data[prediction_column][np.isfinite(reference_data[prediction_column])].unique()) +
        #           list(production_data[prediction_column][np.isfinite(production_data[prediction_column])].unique()))

        ref_feature_vc = reference_data[prediction_column].value_counts()
        prod_feature_vc = production_data[prediction_column].value_counts()
        keys = set(list(reference_data[prediction_column].unique())
                   + list(production_data[prediction_column].unique()))

        ref_feature_dict = dict.fromkeys(keys, 0)
        for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
            ref_feature_dict[key] = item

        prod_feature_dict = dict.fromkeys(keys, 0)
        for key, item in zip(prod_feature_vc.index, prod_feature_vc.values):
            prod_feature_dict[key] = item

        f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
        f_obs = [value[1] for value in sorted(prod_feature_dict.items())]

        # chi-square test comparing reference and current prediction counts
        pred_p_value = chisquare(f_exp, f_obs)[1]
        pred_sim_test = "detected" if pred_p_value < 0.05 else "not detected"

        # plot output distributions
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=reference_data[prediction_column],
                                   marker_color=grey, opacity=0.6, nbinsx=10,
                                   name='Reference', histnorm='probability'))
        fig.add_trace(go.Histogram(x=production_data[prediction_column],
                                   marker_color=red, opacity=0.6, nbinsx=10,
                                   name='Current', histnorm='probability'))
        fig.update_layout(
            legend=dict(orientation="h", yanchor="bottom", y=1.02,
                        xanchor="right", x=1),
            xaxis_title=prediction_column,
            yaxis_title="Share"
        )

        pred_drift_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title="Prediction Drift: " + pred_sim_test
                  + ", p_value=" + str(round(pred_p_value, 6)),
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "data": pred_drift_json['data'],
                "layout": pred_drift_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def get_numeric_columns(df: pd.DataFrame) -> List[str]:
    cols = df.select_dtypes([np.number]).columns
    return cols.tolist()
def test_select_dtypes_empty(self):
    df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
    msg = "at least one of include or exclude must be nonempty"
    with pytest.raises(ValueError, match=msg):
        df.select_dtypes()
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Find the important features.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input dataframe
    y : array-like of shape (n_samples)
        Target variable. Required to train the estimator.

    Returns
    -------
    self
    """
    # check input dataframe
    X = _is_dataframe(X)

    # check variables
    self.variables = _find_all_variables(X, self.variables)

    # check if df contains na
    _check_contains_na(X, self.variables)

    # limit df to variables to smooth code below
    X = X[self.variables].copy()

    # find categorical and numerical variables
    self.variables_categorical_ = list(X.select_dtypes(include="O").columns)
    self.variables_numerical_ = list(
        X.select_dtypes(include=["float", "integer"]).columns)

    # obtain cross-validation indices
    skf = StratifiedKFold(n_splits=self.cv, shuffle=True,
                          random_state=self.random_state)
    skf.get_n_splits(X, y)

    if self.variables_categorical_ and self.variables_numerical_:
        _pipeline = self._make_combined_pipeline()
    elif self.variables_categorical_:
        _pipeline = self._make_categorical_pipeline()
    else:
        _pipeline = self._make_numerical_pipeline()

    # obtain feature performance with cross-validation
    feature_importances_cv = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        _pipeline.fit(X_train, y_train)

        X_test = _pipeline.transform(X_test)

        if self.scoring == "roc_auc_score":
            tmp_split = {f: roc_auc_score(y_test, X_test[f])
                         for f in self.variables}
        else:
            tmp_split = {f: r2_score(y_test, X_test[f])
                         for f in self.variables}

        feature_importances_cv.append(pd.Series(tmp_split))

    feature_importances_cv = pd.concat(feature_importances_cv, axis=1)

    self.feature_performance_ = feature_importances_cv.mean(  # type: ignore
        axis=1).to_dict()

    self.selected_features_ = [
        f for f in self.variables
        if self.feature_performance_[f] > self.threshold
    ]

    self.input_shape_ = X.shape

    return self
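# A self-contained sketch of the idea behind fit() above: score each feature
# on its own with cross-validation and keep the ones that beat a threshold.
# It uses a plain sklearn estimator instead of the class's internal
# pipelines, so the estimator choice and the helper name
# single_feature_performance are illustrative assumptions, not the library's API.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

def single_feature_performance(X, y, cv=3, threshold=0.5):
    estimator = DecisionTreeClassifier(max_depth=2, random_state=0)
    # One cross-validated score per feature, trained on that feature alone.
    performance = {
        col: cross_val_score(estimator, X[[col]], y, cv=cv,
                             scoring='roc_auc').mean()
        for col in X.columns
    }
    selected = [col for col, score in performance.items() if score > threshold]
    return performance, selected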
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    figsize: Tuple = (16, 2),
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Dict[str, Any] = None,
    rug_kws: Dict[str, Any] = None,
    fill_kws: Dict[str, Any] = None,
    font_kws: Dict[str, Any] = None,
):
    """Two-dimensional visualization of the distribution of non binary numerical features.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame \
        is provided, the index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default "orange"
    figsize : Tuple, optional
        Controls the figure size, by default (16, 2)
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about \
        two std. deviations above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75, \
        "linewidth": 1.5, "bw": 0.4}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333", \
        "alpha": 0.05, "linewidth": 4, "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff", \
        "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color": "#111111", \
        "weight": "normal", "size": 11}

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """
    # Handle dictionary defaults
    kde_kws = ({"alpha": 0.75, "linewidth": 1.5, "bw": 0.4}
               if kde_kws is None else kde_kws.copy())
    rug_kws = ({"color": "#ff3333", "alpha": 0.05, "linewidth": 4, "height": 0.075}
               if rug_kws is None else rug_kws.copy())
    fill_kws = ({"color": "#80d4ff", "alpha": 0.2}
                if fill_kws is None else fill_kws.copy())
    font_kws = ({"color": "#111111", "weight": "normal", "size": 11}
                if font_kws is None else font_kws.copy())

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    data = data.loc[:, data.nunique() > 2]
    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]
    data = data.loc[:, data.nunique() > 2]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return
    elif len(cols) >= 20 and showall is False:
        print(
            "Note: The number of non binary numerical features is very large "
            f"({len(cols)}), please consider splitting the data. Showing plots for "
            "the first 20 numerical features. Override this by setting showall=True."
        )
        cols = cols[:20]

    for col in cols:
        num_dropped_vals = data[col].isna().sum()
        if num_dropped_vals > 0:
            col_data = data[col].dropna(axis=0)
            print(f"Dropped {num_dropped_vals} missing values from column {col}.")
        else:
            col_data = data[col]

        _, ax = plt.subplots(figsize=figsize)
        ax = sns.distplot(col_data, hist=False, rug=True,
                          kde_kws=kde_kws, rug_kws=rug_kws)

        # Vertical lines and fill
        x, y = ax.lines[0].get_xydata().T
        ax.fill_between(
            x, y,
            where=((x >= np.quantile(col_data, fill_range[0]))
                   & (x <= np.quantile(col_data, fill_range[1]))),
            label=f"{fill_range[0] * 100:.1f}% - {fill_range[1] * 100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_data)
        std = scipy.stats.tstd(col_data)
        ax.vlines(x=mean, ymin=0, ymax=np.interp(mean, x, y),
                  ls="dotted", color=mean_color, lw=2, label="mean")
        ax.vlines(x=np.median(col_data), ymin=0,
                  ymax=np.interp(np.median(col_data), x, y),
                  ls=":", color=".3", label="median")
        ax.vlines(x=[mean - std, mean + std], ymin=0,
                  ymax=[np.interp(mean - std, x, y), np.interp(mean + std, x, y)],
                  ls=":", color=".5", label="\u03BC \u00B1 \u03C3")

        ax.set_ylim(0)
        ax.set_xlim(ax.get_xlim()[0] * 1.15, ax.get_xlim()[1] * 1.15)

        # Annotations and legend
        ax.text(0.01, 0.85, f"Mean: {mean:.2f}",
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.7, f"Std. dev: {std:.2f}",
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.55, f"Skew: {scipy.stats.skew(col_data):.2f}",
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.4,
                f"Kurtosis: {scipy.stats.kurtosis(col_data):.2f}",  # Excess Kurtosis
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.25, f"Count: {len(col_data)}",
                fontdict=font_kws, transform=ax.transAxes)
        ax.legend(loc="upper right")

    return ax
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        #target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns)
                                 - set(utility_columns))
        #target_names = None

    if production_data is not None and target_column is not None and prediction_column is not None:
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        array_prediction = production_data[prediction_column].to_numpy()
        prediction_ids = np.argmax(array_prediction, axis=-1)
        prediction_labels = [prediction_column[x] for x in prediction_ids]

        # plot support bar
        graphs = []

        for label in prediction_column:
            pred_distr = ff.create_distplot(
                [production_data[production_data[target_column] == label][label],
                 production_data[production_data[target_column] != label][label]],
                [str(label), "other"],
                colors=[red, grey],
                bin_size=0.05,
                show_curve=False,
                show_rug=True)

            pred_distr.update_layout(
                xaxis_title="Probability",
                yaxis_title="Share",
                legend=dict(orientation="h", yanchor="bottom", y=1.02,
                            xanchor="right", x=1))

            pred_distr_json = json.loads(pred_distr.to_json())

            graphs.append({
                "id": "tab_" + str(label),
                "title": str(label),
                "graph": {
                    "data": pred_distr_json["data"],
                    "layout": pred_distr_json["layout"],
                }
            })

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="tabbed_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={"graphs": graphs},
            additionalGraphs=[],
        )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns)
                                 - set(utility_columns))

    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        # plot output correlations
        abs_perc_error_time = go.Figure()

        abs_perc_error = list(map(
            lambda x: 100 * abs(x[0] - x[1]) / x[0],
            zip(reference_data[target_column],
                reference_data[prediction_column])))

        error_trace = go.Scatter(
            x=reference_data[date_column] if date_column else reference_data.index,
            y=abs_perc_error,
            mode='lines',
            name='Absolute Percentage Error',
            marker=dict(size=6, color=red))

        zero_trace = go.Scatter(
            x=reference_data[date_column] if date_column else reference_data.index,
            y=[0] * reference_data.shape[0],
            mode='lines',
            opacity=0.5,
            marker=dict(size=6, color='green'),
            showlegend=False,
        )

        abs_perc_error_time.add_trace(error_trace)
        abs_perc_error_time.add_trace(zero_trace)

        abs_perc_error_time.update_layout(
            xaxis_title="Timestamp" if date_column else "Index",
            yaxis_title="Percent",
            legend=dict(orientation="h", yanchor="bottom", y=1.02,
                        xanchor="right", x=1))

        abs_perc_error_time_json = json.loads(abs_perc_error_time.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": abs_perc_error_time_json['data'],
                "layout": abs_perc_error_time_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def get_numerical_cols(data: pd.DataFrame) -> pd.Index:
    numerical_columns: pd.Index = data.select_dtypes(exclude='object').columns
    return numerical_columns
def _impute_data(df: pd.DataFrame):
    # Mean-impute float columns first, then fall back to the mode for
    # anything still missing (including non-numeric columns).
    for float_col in df.select_dtypes('float64'):
        df[float_col].fillna(df[float_col].mean(), inplace=True)
    for col in df.columns:
        df[col].fillna(df[col].mode().iloc[0], inplace=True)
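# Quick illustrative check of _impute_data on a toy frame: float NaNs get
# the column mean, everything else falls back to the column mode. Note that
# the chained df[col].fillna(..., inplace=True) pattern stops mutating the
# frame under pandas copy-on-write; df[col] = df[col].fillna(...) is the
# copy-safe spelling.
demo = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'c': ['a', None, 'a']})
_impute_data(demo)
# expected: demo['x'] -> [1.0, 2.0, 3.0], demo['c'] -> ['a', 'a', 'a']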
def set_categories(df: pandas.DataFrame,
                   column_categories: Dict[str, pandas.Categorical]):
    for c in df.select_dtypes(include='category').columns:
        df[c].cat.set_categories(column_categories[c].categories, inplace=True)
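# The inplace=True form above was deprecated in later pandas releases (and
# removed in pandas 2.0), so here is a sketch of an assignment-based
# equivalent for newer pandas; set_categories_compat is a hypothetical name,
# not part of any library API.
def set_categories_compat(df: pandas.DataFrame,
                          column_categories: Dict[str, pandas.Categorical]):
    for c in df.select_dtypes(include='category').columns:
        # cat.set_categories returns a new Series when inplace is omitted
        df[c] = df[c].cat.set_categories(column_categories[c].categories)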
def correlation_analysis(
    data: pd.DataFrame,
    col_list=None,
    row_list=None,
    check_norm=False,
    method: str = "pearson",
    dropna: str = "pairwise",
    permutation_test: bool = False,
    n_permutations: int = 1000,
    random_state=None,
):
    """Run correlations for numerical features and return output in different formats

    Different methods to compute correlations and to handle missing values are
    implemented. Inspired by `researchpy.corr_case` and `researchpy.corr_pair`.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe with variables in columns, cases in rows
    row_list: list or None (default: None)
        List with names of columns in `data` that should be in the rows of the
        correlogram. If None, all columns are used but only every unique combination.
    col_list: list or None (default: None)
        List with names of columns in `data` that should be in the columns of the
        correlogram. If None, all columns are used and only every unique combination.
    check_norm: bool (default: False)
        If True, normality will be checked for columns in `data` using `normal_check`.
        This influences the used method for correlations, i.e. Pearson or Spearman.
        Note: normality check ignores missing values.
    method: {'pearson', 'kendall', 'spearman'}, default 'pearson'
        Type of correlation, either Pearson's r, Spearman's rho, or Kendall's tau,
        implemented via respectively `scipy.stats.pearsonr`, `scipy.stats.spearmanr`,
        and `scipy.stats.kendalltau`.
        Will be ignored if check_norm=True. Instead, Pearson's r is used for every
        combination of normally distributed columns and Spearman's rho is used for
        all other combinations.
    dropna : {'listwise', 'pairwise'}, default 'pairwise'
        Should rows with missing values be dropped over the complete `data`
        ('listwise') or for every correlation separately ('pairwise')
    permutation_test: bool (default: False)
        If true, a permutation test will be added
    n_permutations: int (default: 1000)
        Number of permutations in the permutation test
    random_state: None or int (default: None)
        Random state for permutation_test. If not None, random_state will be
        updated for every permutation

    Returns
    -------
    result_dict: dict
        Dictionary with the following keys:
        info: pd.DataFrame
            Description of correlation method, missing values handling and
            number of observations
        r-values: pd.DataFrame
            Dataframe with correlation coefficients. Indices and columns are
            column names from `data`. Only lower triangle is filled.
        p-values: pd.DataFrame
            Dataframe with p-values. Indices and columns are column names from
            `data`. Only lower triangle is filled.
        N: pd.DataFrame
            Dataframe with numbers of observations. Indices and columns are
            column names from `data`. Only lower triangle is filled. If
            dropna='listwise', every correlation will have the same number of
            observations.
        summary: pd.DataFrame
            Dataframe with columns ['analysis', 'feature1', 'feature2',
            'r-value', 'p-value', 'N', 'stat-sign'] which indicate the type of
            test used for the correlation, the pair of columns, the correlation
            coefficient, the p-value, the number of observations for each
            combination of columns in `data` and whether the r-value is
            statistically significant.

    Examples
    --------
    >>> from jmspack.frequentist_statistics import correlation_analysis
    >>> import seaborn as sns
    >>> iris = sns.load_dataset('iris')
    >>> dict_results = correlation_analysis(iris, method='pearson',
    ...                                     dropna='listwise',
    ...                                     permutation_test=True,
    ...                                     n_permutations=100, check_norm=True)
    >>> dict_results['summary']

    References
    ----------
    Bryant, C (2018). researchpy's documentation [Revision 9ae5ed63].
    Retrieved from https://researchpy.readthedocs.io/en/latest/
    """
    # Settings test
    if method == "pearson":
        test, test_name = stats.pearsonr, "Pearson"
    elif method == "spearman":
        test, test_name = stats.spearmanr, "Spearman Rank"
    elif method == "kendall":
        test, test_name = stats.kendalltau, "Kendall's Tau-b"
    else:
        raise ValueError("method not in {'pearson', 'kendall', 'spearman'}")

    # Copy numerical data from the original data
    data = data.copy().select_dtypes("number")

    # Get correct lists
    if col_list and not row_list:
        row_list = data.select_dtypes("number").drop(col_list, axis=1).columns.tolist()
    elif row_list and not col_list:
        col_list = data.select_dtypes("number").drop(row_list, axis=1).columns.tolist()

    # Initializing dataframes to store results
    info = pd.DataFrame()
    summary = pd.DataFrame()
    if not col_list and not row_list:
        r_vals = pd.DataFrame(columns=data.columns, index=data.columns)
        p_vals = pd.DataFrame(columns=data.columns, index=data.columns)
        n_vals = pd.DataFrame(columns=data.columns, index=data.columns)
        iterator = combinations(data.columns, 2)
    else:
        r_vals = pd.DataFrame(columns=col_list, index=row_list)
        p_vals = pd.DataFrame(columns=col_list, index=row_list)
        n_vals = pd.DataFrame(columns=col_list, index=row_list)
        iterator = product(col_list, row_list)

    if dropna == "listwise":
        # Remove rows with missing values
        data = data.dropna(how="any", axis="index")
        info = info.append(
            {f"{test_name} correlation test using {dropna} deletion":
             f"Total observations used = {len(data)}"},
            ignore_index=True,
        )
    elif dropna == "pairwise":
        info = info.append(
            {f"{test_name} correlation test using {dropna} deletion":
             f"Observations in the data = {len(data)}"},
            ignore_index=True,
        )
    else:
        raise ValueError("dropna not in {'listwise', 'pairwise'}")

    if check_norm:
        # Check normality of all columns in the data
        df_normality = normal_check(data)
        norm_names = df_normality.loc[df_normality["normality"], "feature"].tolist()

    # Iterating through the Pandas series and performing the correlation
    for col1, col2 in iterator:
        if dropna == "pairwise":
            # Remove rows with missing values in the pair of columns
            test_data = data[[col1, col2]].dropna()
        else:
            test_data = data

        if check_norm:
            # Select Pearson's r only if both columns are normally distributed
            if (col1 in norm_names) and (col2 in norm_names):
                test, test_name = stats.pearsonr, "Pearson"
            else:
                test, test_name = stats.spearmanr, "Spearman Rank"

        # Run correlations
        r_value, p_value = test(test_data.loc[:, col1], test_data.loc[:, col2])
        n_value = len(test_data)

        # Store output in matrix format
        try:
            r_vals.loc[col2, col1] = r_value
            p_vals.loc[col2, col1] = p_value
            n_vals.loc[col2, col1] = n_value
        except KeyError:
            r_vals.loc[col1, col2] = r_value
            p_vals.loc[col1, col2] = p_value
            n_vals.loc[col1, col2] = n_value

        # Store output in dataframe format
        dict_summary = {
            "analysis": test_name,
            "feature1": col1,
            "feature2": col2,
            "r-value": r_value,
            "p-value": p_value,
            "stat-sign": (p_value < 0.05),
            "N": n_value,
        }

        if permutation_test:
            raise ValueError("permutation_test has yet to be implemented")
            # # Copy the complete data
            # col2_shuffle = np.array(test_data.loc[:, col2])
            # col2_shuffle = np.repeat(
            #     col2_shuffle[:, np.newaxis], n_permutations, axis=1
            # )
            # # Shuffle within the columns
            # np.random.seed(random_state)
            # ix_i = np.random.sample(col2_shuffle.shape).argsort(axis=0)
            # ix_j = np.tile(np.arange(col2_shuffle.shape[1]),
            #                (col2_shuffle.shape[0], 1))
            # col2_shuffle = col2_shuffle[ix_i, ix_j]
            # permutations = np.apply_along_axis(
            #     permute_test,
            #     axis=0,
            #     arr=col2_shuffle,
            #     test_type="correlation",
            #     test=test,
            #     a2=np.array(test_data.loc[:, col1]),
            # )
            #
            # extreme_permutation = np.where(permutations < p_value, 1, 0)
            # p_permutation = extreme_permutation.sum() / len(permutations)
            # dict_summary["permutation-p-value"] = p_permutation
            #
            # # Reset random seed numpy
            # np.random.seed(None)

        summary = pd.concat(
            [summary, pd.DataFrame(data=dict_summary, index=[0])],
            axis=0,
            ignore_index=True,
            sort=False,
        )

    # Embed results within a dictionary
    result_dict = {
        "r-value": r_vals,
        "p-value": p_vals,
        "N": n_vals,
        "info": info,
        "summary": summary,
    }

    return result_dict
def check_dataframe(self, dataframe: pd.DataFrame) -> pd.DataFrame:
    dataframe = super().check_dataframe(dataframe=dataframe)
    dataframe["luid"] = dataframe["filepath"]
    for column in dataframe.select_dtypes("number").columns:
        dataframe[column] = dataframe[column].map(str)
    return dataframe
def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    ints = df.select_dtypes(include=['int64']).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast='integer')
    return df
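# A hedged companion sketch to optimize_ints above: the same downcast trick
# for float columns. The name optimize_floats is an assumption, mirroring the
# int version; pd.to_numeric with downcast='float' converts a column to
# float32 where the values allow it.
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
    return df

# e.g. optimize_ints(pd.DataFrame({'n': [1, 2, 3]}))['n'].dtype -> int8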
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        #target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns)
                                 - set(utility_columns))
        #target_names = None

    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        #array_prediction = reference_data[prediction_column].to_numpy()
        #prediction_ids = np.argmax(array_prediction, axis=-1)
        #prediction_labels = [prediction_column[x] for x in prediction_ids]

        if len(prediction_column) <= 2:
            binarizer = preprocessing.LabelBinarizer()
            binarizer.fit(reference_data[target_column])
            binarized_target = pd.DataFrame(
                binarizer.transform(reference_data[target_column]))
            binarized_target.columns = ['target']

            p, r, thrs = metrics.precision_recall_curve(
                binarized_target, reference_data[prediction_column[0]])

            fig = go.Figure()
            fig.add_trace(go.Scatter(x=p, y=r, mode='lines', name='PR',
                                     marker=dict(size=6, color=red)))
            fig.update_layout(yaxis_title="Precision", xaxis_title="Recall",
                              showlegend=True)

            fig_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1 if production_data is not None else 2,
                params={
                    "data": fig_json['data'],
                    "layout": fig_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            binarizer = preprocessing.LabelBinarizer()
            binarizer.fit(reference_data[target_column])
            binarized_target = pd.DataFrame(
                binarizer.transform(reference_data[target_column]))
            binarized_target.columns = prediction_column

            # plot support bar
            graphs = []

            for label in prediction_column:
                p, r, thrs = metrics.precision_recall_curve(
                    binarized_target[label], reference_data[label])

                fig = go.Figure()
                fig.add_trace(go.Scatter(x=p, y=r, mode='lines', name='PR',
                                         marker=dict(size=6, color=red)))
                fig.update_layout(yaxis_title="Precision",
                                  xaxis_title="Recall", showlegend=True)

                fig_json = json.loads(fig.to_json())

                graphs.append({
                    "id": "tab_" + str(label),
                    "title": str(label),
                    "graph": {
                        "data": fig_json["data"],
                        "layout": fig_json["layout"],
                    }
                })

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="tabbed_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1 if production_data is not None else 2,
                params={"graphs": graphs},
                additionalGraphs=[],
            )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns)
                                 - set(utility_columns))
        target_names = None

    if production_data is not None and target_column is not None and prediction_column is not None:
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        # plot support bar
        metrics_matrix = metrics.classification_report(
            production_data[target_column],
            production_data[prediction_column],
            output_dict=True)
        metrics_frame = pd.DataFrame(metrics_matrix)
        support = metrics_frame.iloc[-1:, :-3].values[0]

        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=target_names if target_names else metrics_frame.columns.tolist()[:-3],
            y=metrics_frame.iloc[-1:, :-3].values[0],
            marker_color=red,
            name='Support'))
        fig.update_layout(
            xaxis_title="Class",
            yaxis_title="Number of Objects",
        )

        support_bar_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": support_bar_json['data'],
                "layout": support_bar_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [name for name in num_feature_names
                                 if is_numeric_dtype(reference_data[name])]
        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [name for name in cat_feature_names
                                 if is_numeric_dtype(reference_data[name])]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None
        utility_columns = [date_column, id_column, target_column, prediction_column]
        num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns)
                                 - set(utility_columns))
        cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns)
                                 - set(utility_columns))

    # set params data
    params_data = []
    drifted_features_count = 0
    #plt.ioff()

    for feature_name in num_feature_names:  # + cat_feature_names: #feature_names:
        prod_small_hist = np.histogram(
            production_data[feature_name][np.isfinite(production_data[feature_name])],
            bins=10, density=True)
        ref_small_hist = np.histogram(
            reference_data[feature_name][np.isfinite(reference_data[feature_name])],
            bins=10, density=True)

        feature_type = 'num'

        # two-sample KS test between reference and production values
        p_value = ks_2samp(reference_data[feature_name],
                           production_data[feature_name])[1]

        distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected"
        drifted_features_count += 1 if p_value < 0.05 else 0

        params_data.append({
            "details": {
                "parts": [{"title": "Data drift",
                           "id": feature_name + "_drift",
                           "type": "widget"},
                          {"title": "Data distribution",
                           "id": feature_name + "_distr"}],
                "insights": []
            },
            "f1": feature_name,
            "f6": feature_type,
            "f3": {"x": list(ref_small_hist[1]), "y": list(ref_small_hist[0])},
            "f4": {"x": list(prod_small_hist[1]), "y": list(prod_small_hist[0])},
            "f2": distr_sim_test,
            "f5": round(p_value, 6)
        })

    for feature_name in cat_feature_names:  # feature_names:
        prod_small_hist = np.histogram(
            production_data[feature_name][np.isfinite(production_data[feature_name])],
            bins=10, density=True)
        ref_small_hist = np.histogram(
            reference_data[feature_name][np.isfinite(reference_data[feature_name])],
            bins=10, density=True)

        feature_type = 'cat'

        #p_value = ks_2samp(reference_data[feature_name], production_data[feature_name])[1]
        #CHI2 to be implemented for cases with different categories
        ref_feature_vc = reference_data[feature_name][np.isfinite(
            reference_data[feature_name])].value_counts()
        prod_feature_vc = production_data[feature_name][np.isfinite(
            production_data[feature_name])].value_counts()

        keys = set(list(reference_data[feature_name][np.isfinite(
                       reference_data[feature_name])].unique())
                   + list(production_data[feature_name][np.isfinite(
                       production_data[feature_name])].unique()))

        ref_feature_dict = dict.fromkeys(keys, 0)
        for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
            ref_feature_dict[key] = item

        prod_feature_dict = dict.fromkeys(keys, 0)
        for key, item in zip(prod_feature_vc.index, prod_feature_vc.values):
            prod_feature_dict[key] = item

        f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
        f_obs = [value[1] for value in sorted(prod_feature_dict.items())]

        p_value = chisquare(f_exp, f_obs)[1]

        distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected"
        drifted_features_count += 1 if p_value < 0.05 else 0

        params_data.append({
            "details": {
                "parts": [{"title": "Data drift",
                           "id": feature_name + "_drift",
                           "type": "widget"},
                          {"title": "Data distribution",
                           "id": feature_name + "_distr"}],
                "insights": []
            },
            "f1": feature_name,
            "f6": feature_type,
            "f3": {"x": list(ref_small_hist[1]), "y": list(ref_small_hist[0])},
            "f4": {"x": list(prod_small_hist[1]), "y": list(prod_small_hist[0])},
            "f2": distr_sim_test,
            "f5": round(p_value, 6)
        })

    # set additionalGraphs
    additional_graphs_data = []
    for feature_name in num_feature_names + cat_feature_names:  # feature_names:
        # plot distributions
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=reference_data[feature_name],
                                   marker_color=grey, opacity=0.6, nbinsx=10,
                                   name='Reference', histnorm='probability'))
        fig.add_trace(go.Histogram(x=production_data[feature_name],
                                   marker_color=red, opacity=0.6, nbinsx=10,
                                   name='Production', histnorm='probability'))
        fig.update_layout(legend=dict(orientation="h", yanchor="bottom",
                                      y=1.02, xanchor="right", x=1),
                          xaxis_title=feature_name,
                          yaxis_title="Share")

        distr_figure = json.loads(fig.to_json())

        # plot drift
        reference_mean = np.mean(reference_data[feature_name][np.isfinite(
            reference_data[feature_name])])
        reference_std = np.std(reference_data[feature_name][np.isfinite(
            reference_data[feature_name])], ddof=1)

        x_title = "Timestamp" if date_column else "Index"

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=production_data[date_column] if date_column else production_data.index,
            y=production_data[feature_name],
            mode='markers',
            name='Production',
            marker=dict(size=6, color=grey)))

        fig.update_layout(
            xaxis_title=x_title,
            yaxis_title=feature_name,
            showlegend=True,
            legend=dict(orientation="h", yanchor="bottom", y=1.02,
                        xanchor="right", x=1),
            shapes=[
                dict(
                    type="rect",
                    # x-reference is assigned to the x-values
                    xref="paper",
                    # y-reference is assigned to the plot paper [0,1]
                    yref="y",
                    x0=0,
                    y0=reference_mean - reference_std,
                    x1=1,
                    y1=reference_mean + reference_std,
                    fillcolor="LightGreen",
                    opacity=0.5,
                    layer="below",
                    line_width=0,
                ),
                dict(
                    type="line",
                    name='Reference',
                    xref="paper",
                    yref="y",
                    x0=0,  # min(testset_agg_by_date.index),
                    y0=reference_mean,
                    x1=1,  # max(testset_agg_by_date.index),
                    y1=reference_mean,
                    line=dict(color="Green", width=3)
                ),
            ])

        drift_figure = json.loads(fig.to_json())

        # add distributions data
        additional_graphs_data.append(
            AdditionalGraphInfo(feature_name + '_distr', {
                "data": distr_figure['data'],
                "layout": distr_figure['layout']
            }))

        # add drift data
        additional_graphs_data.append(
            AdditionalGraphInfo(feature_name + '_drift', {
                "title": feature_name + "drift",
                "size": 2,
                "text": "",
                "type": "big_graph",
                "params": {
                    "data": drift_figure['data'],
                    "layout": drift_figure['layout']
                }
            }))

    self.wi = BaseWidgetInfo(
        title="Data Drift: drift detected for " + str(drifted_features_count)
              + " out of " + str(len(num_feature_names) + len(cat_feature_names))
              + " features",
        type="big_table",
        details="",
        alertStats=AlertStats(),
        alerts=[],
        alertsPosition="row",
        insights=[],
        size=2,
        params={
            "rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10),
            "columns": [
                {"title": "Feature", "field": "f1"},
                {"title": "Type", "field": "f6"},
                {"title": "Reference Distribution", "field": "f3",
                 "type": "histogram",
                 "options": {"xField": "x", "yField": "y"}},
                {"title": "Production Distribution", "field": "f4",
                 "type": "histogram",
                 "options": {"xField": "x", "yField": "y"}},
                {"title": "Data drift", "field": "f2"},
                {"title": "P-Value for Similarity Test", "field": "f5",
                 "sort": "asc"}
            ],
            "data": params_data
        },
        additionalGraphs=additional_graphs_data)
class DataWorker(object):

    @staticmethod
    def feat_value2int(series):
        all_values = list(enumerate(np.unique(series)))
        value_dict = {name: i for i, name in all_values}
        return value_dict

    def __init__(self, data=None):
        """
        Init DataWorker with a pandas.DataFrame.
        Otherwise make sure that the raw data can be transformed to a DataFrame.
        """
        if data is None:
            self.__data = DataFrame()
        elif isinstance(data, DataFrame):
            self.__data = data.copy()
        else:
            self.__data = DataFrame(data)
        self.__featureDict = None

    @property
    def featureDict(self):
        return self.__data.select_dtypes(include=['object'])

    @featureDict.setter
    def featureDict(self, value):
        pass

    @property
    def data(self):
        return self.__data

    @data.setter
    def data(self, df):
        self.__data = df

    def getColNamesWithNan(self):
        s = self.__data.isnull().any()
        return s.index[s == True].tolist()

    def dataClean(self, transDict=None, fillna={'all': 'most_frequent'}, yCol=-1):
        """
        yCol: the col you want to predict
        fillna: {column: method_name} dictionary, default {'all': 'most_frequent'}
            provided functions are: 'most_frequent', 'mean', 'median',
            'first_n_frequent,n' (where the trailing n is a number)
            when key == 'all': fill every column that includes NaN with the
            same function; this key is suggested to be put at the end
        """
        # try to map all data to numeric
        self.__data = cd.fillna(self.__data, fillna)
        if transDict is None:
            self.__featureDict
        if yCol != -1:
            self.__data = cd.change_yCol(self.__data, yCol)

    def algorithmUsing(self):
        pass

    def showFeagure(self):
        pass

    def getResult(self):
        pass
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    if self.columns is None:
        self.columns = list(X.select_dtypes('number').columns)
    raise NotImplementedError
def test_select_dtypes_empty(self):
    df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
    with tm.assertRaisesRegexp(ValueError, 'at least one of include or '
                                           'exclude must be nonempty'):
        df.select_dtypes()
def _get_date_columns(dataframe: pd.DataFrame):
    return dataframe.select_dtypes(include=[np.datetime64]).columns.values
def get_text_categorical_columns(df: pd.DataFrame) -> List[str]:
    # .tolist() so the return value matches the List[str] annotation
    return df.select_dtypes(exclude=['int', 'float']).columns.tolist()
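# A quick, illustrative sanity check for the two helpers above. Note that
# get_text_categorical_columns only excludes int/float, so datetime columns
# also come back from it (on a typical 64-bit platform).
demo = pd.DataFrame({'n': [1, 2],
                     'x': [0.5, 1.5],
                     's': ['a', 'b'],
                     't': pd.to_datetime(['2020-01-01', '2020-01-02'])})
print(_get_date_columns(demo))             # array(['t'], dtype=object)
print(get_text_categorical_columns(demo))  # ['s', 't']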
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [ name for name in num_feature_names if is_numeric_dtype(reference_data[name]) ] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [ name for name in cat_feature_names if is_numeric_dtype(reference_data[name]) ] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [ date_column, id_column, target_column, prediction_column ] num_feature_names = list( set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list( set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if production_data is not None: production_data.replace([np.inf, -np.inf], np.nan, inplace=True) production_data.dropna(axis=0, how='any', inplace=True) reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) ref_error = reference_data[prediction_column] - reference_data[ target_column] prod_error = production_data[prediction_column] - production_data[ target_column] ref_quntile_5 = np.quantile(ref_error, .05) ref_quntile_95 = np.quantile(ref_error, .95) prod_quntile_5 = np.quantile(prod_error, .05) prod_quntile_95 = np.quantile(prod_error, .95) #create subplots reference_data['dataset'] = 'Reference' reference_data['Error bias'] = list( map( lambda x: 'Underestimation' if x <= ref_quntile_5 else 'Majority' if x < ref_quntile_95 else 'Overestimation', ref_error)) production_data['dataset'] = 'Production' production_data['Error bias'] = list( map( lambda x: 'Underestimation' if x <= prod_quntile_5 else 'Majority' if x < prod_quntile_95 else 'Overestimation', prod_error)) merged_data = pd.concat([reference_data, production_data]) reference_data.drop(['dataset', 'Error bias'], axis=1, inplace=True) production_data.drop(['dataset', 'Error bias'], axis=1, inplace=True) params_data = [] additional_graphs_data = [] for feature_name in num_feature_names: feature_type = 'num' ref_overal_value = np.mean(reference_data[feature_name]) ref_under_value = np.mean( reference_data[ref_error <= ref_quntile_5][feature_name]) ref_expected_value = np.mean( reference_data[(ref_error > ref_quntile_5) & (ref_error < ref_quntile_95)][feature_name]) ref_over_value = np.mean( reference_data[ref_error >= ref_quntile_95][feature_name]) ref_range_value = 0 if ref_over_value == ref_under_value else 100 * abs( ref_over_value - ref_under_value) / (np.max(reference_data[feature_name]) - np.min(reference_data[feature_name])) prod_overal_value = np.mean(production_data[feature_name]) prod_under_value = np.mean(production_data[ prod_error <= prod_quntile_5][feature_name]) prod_expected_value = np.mean(production_data[ (prod_error > prod_quntile_5) & (prod_error < prod_quntile_95)][feature_name]) prod_over_value = np.mean(production_data[ prod_error >= prod_quntile_95][feature_name]) prod_range_value = 0 
                if prod_over_value == prod_under_value else 100 * abs(
                    prod_over_value - prod_under_value) / (
                        np.max(production_data[feature_name]) -
                        np.min(production_data[feature_name]))

                feature_hist = px.histogram(
                    merged_data,
                    x=feature_name,
                    color='Error bias',
                    facet_col="dataset",
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "dataset": ["Reference", "Production"],
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                feature_hist_json = json.loads(feature_hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": round(ref_expected_value, 2),
                    "f4": round(ref_under_value, 2),
                    "f5": round(ref_over_value, 2),
                    "f6": round(ref_range_value, 2),
                    "f7": round(prod_expected_value, 2),
                    "f8": round(prod_under_value, 2),
                    "f9": round(prod_over_value, 2),
                    "f10": round(prod_range_value, 2)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(feature_name + '_hist', {
                        "data": feature_hist_json['data'],
                        "layout": feature_hist_json['layout']
                    }))

            for feature_name in cat_feature_names:
                feature_type = 'cat'
                ref_overall_value = reference_data[feature_name].value_counts().idxmax()
                ref_under_value = reference_data[
                    ref_error <= ref_quntile_5][feature_name].value_counts().idxmax()
                ref_over_value = reference_data[
                    ref_error >= ref_quntile_95][feature_name].value_counts().idxmax()
                # flag a bias when the most frequent category in either error
                # tail differs from the overall majority category
                ref_range_value = 1 if (ref_overall_value != ref_under_value) \
                    or (ref_overall_value != ref_over_value) else 0

                prod_overall_value = production_data[feature_name].value_counts().idxmax()
                prod_under_value = production_data[
                    prod_error <= prod_quntile_5][feature_name].value_counts().idxmax()
                prod_over_value = production_data[
                    prod_error >= prod_quntile_95][feature_name].value_counts().idxmax()
                prod_range_value = 1 if (prod_overall_value != prod_under_value) \
                    or (prod_overall_value != prod_over_value) else 0

                feature_hist = px.histogram(
                    merged_data,
                    x=feature_name,
                    color='Error bias',
                    facet_col="dataset",
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "dataset": ["Reference", "Production"],
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                feature_hist_json = json.loads(feature_hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": str(ref_overall_value),
                    "f4": str(ref_under_value),
                    "f5": str(ref_over_value),
                    "f6": int(ref_range_value),
                    "f7": str(prod_overall_value),
                    "f8": str(prod_under_value),
                    "f9": str(prod_over_value),
                    "f10": int(prod_range_value)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(feature_name + '_hist', {
                        "data": feature_hist_json['data'],
                        "layout": feature_hist_json['layout']
                    }))

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_table",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "rowsPerPage":
                    min(len(num_feature_names) + len(cat_feature_names), 10),
                    "columns": [{
                        "title": "Feature",
                        "field": "f1"
                    }, {
                        "title": "Type",
                        "field": "f2"
                    }, {
                        "title": "REF: Majority",
                        "field": "f3"
                    }, {
                        "title": "REF: Under",
                        "field": "f4"
                    }, {
                        "title": "REF: Over",
                        "field": "f5"
                    }, {
                        "title": "REF: Range(%)",
                        "field": "f6"
                    }, {
                        "title": "PROD: Majority",
                        "field": "f7"
                    }, {
                        "title": "PROD: Under",
                        "field": "f8"
                    }, {
                        "title": "PROD: Over",
                        "field": "f9"
                    }, {
                        "title": "PROD: Range(%)",
                        "field": "f10",
                        "sort": "desc"
                    }],
                    "data": params_data
                },
                additionalGraphs=additional_graphs_data)
        else:
            reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            reference_data.dropna(axis=0, how='any', inplace=True)

            error = reference_data[prediction_column] - reference_data[target_column]

            quantile_5 = np.quantile(error, .05)
            quantile_95 = np.quantile(error, .95)

            reference_data['Error bias'] = list(
                map(
                    lambda x: 'Underestimation' if x <= quantile_5 else
                    'Majority' if x < quantile_95 else 'Overestimation',
                    error))

            params_data = []
            additional_graphs_data = []

            for feature_name in num_feature_names:
                feature_type = 'num'
                ref_overall_value = np.mean(reference_data[feature_name])
                ref_under_value = np.mean(
                    reference_data[error <= quantile_5][feature_name])
                ref_over_value = np.mean(
                    reference_data[error >= quantile_95][feature_name])
                ref_range_value = 0 if ref_over_value == ref_under_value else 100 * abs(
                    ref_over_value - ref_under_value) / (
                        np.max(reference_data[feature_name]) -
                        np.min(reference_data[feature_name]))

                hist = px.histogram(
                    reference_data,
                    x=feature_name,
                    color='Error bias',
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                hist_figure = json.loads(hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": round(ref_overall_value, 2),
                    "f4": round(ref_under_value, 2),
                    "f5": round(ref_over_value, 2),
                    "f6": round(ref_range_value, 2)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(feature_name + '_hist', {
                        "data": hist_figure['data'],
                        "layout": hist_figure['layout']
                    }))

            for feature_name in cat_feature_names:
                feature_type = 'cat'
                ref_overall_value = reference_data[feature_name].value_counts().idxmax()
                ref_under_value = reference_data[
                    error <= quantile_5][feature_name].value_counts().idxmax()
                ref_over_value = reference_data[
                    error >= quantile_95][feature_name].value_counts().idxmax()
                ref_range_value = 1 if (ref_overall_value != ref_under_value) \
                    or (ref_overall_value != ref_over_value) else 0

                hist = px.histogram(
                    reference_data,
                    x=feature_name,
                    color='Error bias',
                    histnorm='percent',
                    barmode='overlay',
                    category_orders={
                        "Error bias":
                        ["Underestimation", "Overestimation", "Majority"]
                    })

                hist_figure = json.loads(hist.to_json())

                params_data.append({
                    "details": {
                        "parts": [{
                            "title": "Error bias",
                            "id": feature_name + "_hist"
                        }],
                        "insights": []
                    },
                    "f1": feature_name,
                    "f2": feature_type,
                    "f3": str(ref_overall_value),
                    "f4": str(ref_under_value),
                    "f5": str(ref_over_value),
                    "f6": int(ref_range_value)
                })

                additional_graphs_data.append(
                    AdditionalGraphInfo(feature_name + '_hist', {
                        "data": hist_figure['data'],
                        "layout": hist_figure['layout']
                    }))

            reference_data.drop('Error bias', axis=1, inplace=True)

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_table",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "rowsPerPage":
                    min(len(num_feature_names) + len(cat_feature_names), 10),
                    "columns": [{
                        "title": "Feature",
                        "field": "f1"
                    }, {
                        "title": "Type",
                        "field": "f2"
                    }, {
                        "title": "Majority",
                        "field": "f3"
                    }, {
                        "title": "Underestimation",
                        "field": "f4"
                    }, {
                        "title": "Overestimation",
                        "field": "f5"
                    }, {
                        "title": "Range(%)",
                        "field": "f6",
                        "sort": "desc"
                    }],
                    "data": params_data
                },
                additionalGraphs=additional_graphs_data)
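# The quantile-based error-bias labeling used throughout the widget above can
# be exercised on its own. A minimal sketch, not the widget's actual entry
# point: it assumes a regression frame with hypothetical 'target' and
# 'prediction' columns, takes the 5th/95th percentiles of the signed error,
# and tags each row the same way the widget does.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"target": rng.normal(0, 1, 500)})
df["prediction"] = df["target"] + rng.normal(0, 0.5, 500)

error = df["prediction"] - df["target"]
quantile_5, quantile_95 = np.quantile(error, [.05, .95])

# Bottom 5% of errors -> underestimation, top 5% -> overestimation,
# everything in between is the majority group.
df["Error bias"] = np.select(
    [error <= quantile_5, error >= quantile_95],
    ["Underestimation", "Overestimation"],
    default="Majority")

print(df["Error bias"].value_counts())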
def determine_numeric_features(df: pd.DataFrame) -> pd.Index:
    # .columns returns an Index, not a Series, hence the return annotation
    return df.select_dtypes(include=["int64", "float64"]).columns
def normalize(df: pd.DataFrame, scaler) -> pd.DataFrame:
    # np.float/np.int were removed in NumPy 1.24; np.number selects all
    # numeric dtypes, which covers the original float/int intent
    df_num = df.select_dtypes(include=[np.number])
    df[list(df_num.columns)] = scaler.transform(df[list(df_num.columns)])
    return df
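# A minimal usage sketch for normalize(), assuming the scaler is an already
# fitted sklearn StandardScaler; the snippet above does not pin down the
# scaler type, so that is an assumption (any object with .transform works,
# as long as it was fitted on the same numeric columns in the same order).
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10, 20, 30], "label": list("abc")})
scaler = StandardScaler().fit(df.select_dtypes(include=[np.number]))
normalized = normalize(df.copy(), scaler)  # 'label' is left untouched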
def fit(self, X: pd.DataFrame, y=None):
    # Compute standard deviations only for the numeric columns;
    # X.std() alone would also try to reduce non-numeric columns.
    self.std = X.select_dtypes(include=['float', 'int']).std()
    self.columns = self.std.index.values
    return self
def impute_continuous_missing_values(dataframe: pd.DataFrame) -> pd.DataFrame:
    new_value = 0
    continuous_columns = dataframe.select_dtypes(include='number').columns
    for column_name in continuous_columns:
        dataframe[column_name] = dataframe[column_name].fillna(new_value)
    return dataframe
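# Hypothetical sanity check for impute_continuous_missing_values(): only the
# numeric column gets its NaN replaced with 0; the object column keeps its gap.
import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [25.0, np.nan, 40.0], "city": ["NY", None, "LA"]})
df = impute_continuous_missing_values(df)
print(df["age"].tolist())       # [25.0, 0.0, 40.0]
print(df["city"].isna().sum())  # 1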
def test_select_dtypes_empty(self):
    df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
    with tm.assert_raises_regex(ValueError, 'at least one of '
                                'include or exclude '
                                'must be nonempty'):
        df.select_dtypes()
from typing import List

def get_categorical_column_names(dataframe: pd.DataFrame) -> List[str]:
    # '[str]' is not a valid type annotation; List[str] is the intended type
    categorical_columns = list(dataframe.select_dtypes(include='object').columns)
    return categorical_columns
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
    return df
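# A hypothetical before/after for optimize_floats(): values that fit in
# float32 are downcast, roughly halving the memory of float64 columns.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.arange(1000, dtype="float64")})
before = df.memory_usage(deep=True).sum()
df = optimize_floats(df)
after = df.memory_usage(deep=True).sum()
print(df["a"].dtype, before, after)  # float32, ~half the bytes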
import pandas as pd