def test_get_numeric_data_preserve_dtype(self):
    # get the numeric data
    o = DataFrame({'A': [1, '2', 3.]})
    result = o._get_numeric_data()
    expected = DataFrame(index=[0, 1, 2], dtype=object)
    self._compare(result, expected)
def test_get_numeric_data(self):
    # TODO(wesm): unused?
    intname = np.dtype(np.int_).name  # noqa
    floatname = np.dtype(np.float_).name  # noqa
    datetime64name = np.dtype('M8[ns]').name
    objectname = np.dtype(np.object_).name

    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'f': Timestamp('20010102')},
                   index=np.arange(10))
    result = df.dtypes
    expected = Series([np.dtype('float64'),
                       np.dtype('int64'),
                       np.dtype(objectname),
                       np.dtype(datetime64name)],
                      index=['a', 'b', 'c', 'f'])
    assert_series_equal(result, expected)

    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'd': np.array([1.] * 10, dtype='float32'),
                    'e': np.array([1] * 10, dtype='int32'),
                    'f': np.array([1] * 10, dtype='int16'),
                    'g': Timestamp('20010102')},
                   index=np.arange(10))

    result = df._get_numeric_data()
    expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
    assert_frame_equal(result, expected)

    only_obj = df.loc[:, ['c', 'g']]
    result = only_obj._get_numeric_data()
    expected = df.loc[:, []]
    assert_frame_equal(result, expected)

    df = DataFrame.from_dict({'a': [1, 2], 'b': ['foo', 'bar'],
                              'c': [np.pi, np.e]})
    result = df._get_numeric_data()
    expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
    assert_frame_equal(result, expected)

    df = result.copy()
    result = df._get_numeric_data()
    expected = df
    assert_frame_equal(result, expected)
class GetNumericData:

    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df['foo'] = 'bar'
        self.df['bar'] = 'baz'
        self.df = self.df._consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
class GetNumericData: def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) self.df["foo"] = "bar" self.df["bar"] = "baz" self.df = self.df._consolidate() def time_frame_get_numeric_data(self): self.df._get_numeric_data()
class GetNumericData(object):

    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df['foo'] = 'bar'
        self.df['bar'] = 'baz'
        with warnings.catch_warnings(record=True):
            self.df = self.df.consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
class GetNumericData(object):

    def setup(self):
        self.df = DataFrame(np.random.randn(10000, 25))
        self.df['foo'] = 'bar'
        self.df['bar'] = 'baz'
        self.df = self.df._consolidate()

    def time_frame_get_numeric_data(self):
        self.df._get_numeric_data()
def test_get_numeric_data(self): # TODO(wesm): unused? intname = np.dtype(np.int_).name # noqa floatname = np.dtype(np.float_).name # noqa datetime64name = np.dtype("M8[ns]").name objectname = np.dtype(np.object_).name df = DataFrame( {"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")}, index=np.arange(10), ) result = df.dtypes expected = Series( [ np.dtype("float64"), np.dtype("int64"), np.dtype(objectname), np.dtype(datetime64name), ], index=["a", "b", "c", "f"], ) assert_series_equal(result, expected) df = DataFrame( { "a": 1.0, "b": 2, "c": "foo", "d": np.array([1.0] * 10, dtype="float32"), "e": np.array([1] * 10, dtype="int32"), "f": np.array([1] * 10, dtype="int16"), "g": Timestamp("20010102"), }, index=np.arange(10), ) result = df._get_numeric_data() expected = df.loc[:, ["a", "b", "d", "e", "f"]] assert_frame_equal(result, expected) only_obj = df.loc[:, ["c", "g"]] result = only_obj._get_numeric_data() expected = df.loc[:, []] assert_frame_equal(result, expected) df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]}) result = df._get_numeric_data() expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]}) assert_frame_equal(result, expected) df = result.copy() result = df._get_numeric_data() expected = df assert_frame_equal(result, expected)
def basicInfoAnalysis(df: DataFrame):
    numeric_col_num = len(df._get_numeric_data().columns)
    object_col_num = len(df.select_dtypes(['object']).columns)
    categorical_col_num = len(df.select_dtypes(['category']).columns)
    bool_col_num = len(df.select_dtypes(['bool']).columns)
    print('# of numeric columns:', numeric_col_num)
    print('# of object columns:', object_col_num)
    print('# of category columns:', categorical_col_num)
    print('# of bool columns:', bool_col_num)
    if numeric_col_num != 0:
        print('*' * 10 + ' Numeric Variable Insight ' + '*' * 10)
        print(df[df._get_numeric_data().columns].describe())
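# A minimal usage sketch for basicInfoAnalysis above; the frame and its
# values are illustrative only, and DataFrame is assumed to be
# pandas.DataFrame, matching the annotation in the function signature.
demo = DataFrame({'age': [25, 32, 47],
                  'city': ['NY', 'LA', 'SF'],
                  'vip': [True, False, True]})
basicInfoAnalysis(demo)
# Reports 2 numeric columns: bool columns are returned by
# _get_numeric_data() too (see the test_get_X_columns snippets below),
# while select_dtypes(['bool']) counts 'vip' separately.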
def plot_boxes(df: pd.DataFrame, cols: list = None, out_path: str = None,
               show_p: bool = True, return_p: bool = False, h: int = None,
               w: int = None, spacing: float = 0.05,
               theme: str = 'simple_white', renderer: str = 'browser',
               n_cols: int = 3, shared_yaxes: bool = True,
               cols_like: list = None):
    """plot box plots"""
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    n_rows = math.ceil(len(cols) / n_cols)
    p = make_subplots(rows=n_rows, cols=n_cols, shared_yaxes=shared_yaxes,
                      vertical_spacing=spacing, horizontal_spacing=spacing)
    # figure out what to plot where on the subplot
    axes_dict = dict()
    i = 0
    for index, x in np.ndenumerate(np.zeros((n_cols, n_rows))):
        axes_dict[i] = index
        i += 1
    # make each plot
    for i, col in enumerate(cols):
        p.add_trace(go.Box(name=col, y=df[col]),
                    row=axes_dict[i][1] + 1, col=axes_dict[i][0] + 1)
    if h:
        p.update_layout(height=h)
    if w:
        p.update_layout(width=w)
    p.update_layout(template=theme)
    if out_path:
        plotly.offline.plot(p, filename=out_path, auto_open=False)
    if show_p:
        p.show(renderer=renderer)
    if return_p:
        return p
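# A usage sketch for plot_boxes, assuming plotly and the module imports the
# body relies on (math, numpy as np, plotly.graph_objects as go,
# make_subplots); the data is random and illustrative.
scores = pd.DataFrame(np.random.randn(200, 4), columns=['a', 'b', 'c', 'd'])
fig = plot_boxes(scores, show_p=False, return_p=True)  # build figure without showing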
def vif(df: pd.DataFrame, dependent: str) -> pd.DataFrame:
    """Get Variance Inflation Factor for each feature in df via a simple,
    multiple regression.

    Arguments:
        df {pd.DataFrame} -- dataset
        dependent {str} -- column name of dependent feature in df

    Returns:
        pd.DataFrame -- DataFrame containing feature names and VIF measures.
    """
    # https://etav.github.io/python/vif_factor_python.html
    df = df.dropna()
    df = df._get_numeric_data()  # drop non-numeric cols

    # gather features
    features = "+".join(df.columns.drop(dependent).tolist())

    # get y and X dataframes based on this regression:
    y, X = dmatrices('{} ~'.format(dependent) + features, df,
                     return_type='dataframe')

    # For each X, calculate VIF and save in dataframe
    vif = pd.DataFrame()
    vif["VIF Factor"] = [
        variance_inflation_factor(X.values, i) for i in range(X.shape[1])
    ]
    vif["features"] = X.columns
    return vif.round(1)
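# A sketch of calling vif(), assuming the module already imports dmatrices
# from patsy and variance_inflation_factor from
# statsmodels.stats.outliers_influence; the numbers are made up.
toy = pd.DataFrame({'y': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                    'x1': [2.0, 4.1, 5.9, 8.2, 10.1, 12.0],
                    'x2': [1.0, 0.9, 1.2, 0.8, 1.1, 1.0]})
print(vif(toy, dependent='y'))
# One 'VIF Factor' row per design-matrix column, including the Intercept
# column that dmatrices adds.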
def LinearReg_Term(data: pd.DataFrame):
    # Handle negative term values
    # way no1
    data = data._get_numeric_data()  # <- this increases accuracy
    data[data < 0] = 0
    # way no2
    # data['profit'] = data['profit'] + 1 - data['profit'].min()

    # Log Transformation
    data['price'] = np.log1p(data['price_meter_sq'])
    data['profit'] = np.log1p(data['profit'])
    data['term'] = np.log1p(data['term'])

    # Create X and y for Linear Model training
    X = data[['profit', 'price_meter_sq']]
    y = data[['term']].values.ravel()

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Create LinearModel and fit it
    reg = LinearRegression().fit(X, y)
    print("Term linear regression fitted", flush=True)
    return reg
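# A hypothetical call to LinearReg_Term with synthetic listings data,
# assuming numpy as np and sklearn.linear_model.LinearRegression are
# imported as the function body requires.
listings = pd.DataFrame({'price_meter_sq': np.random.uniform(1e3, 5e3, 100),
                         'profit': np.random.uniform(-10.0, 100.0, 100),
                         'term': np.random.uniform(1.0, 365.0, 100)})
term_model = LinearReg_Term(listings)  # fitted LinearRegression on log1p features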
def bin_data(data: pd.DataFrame,
             cols: Union[list, np.ndarray, tuple] = (),
             bins: Union[int, list, np.ndarray, dict] = 10,
             quantile: bool = False,
             retbins: bool = False):
    """
    Index the input DataFrame given the bin_edges for the columns specified
    in cols.

    :param DataFrame data: input data
    :param list cols: list of columns with numeric data which needs to be
        indexed
    :param bins: number of bins, or a list of bin edges (same for all
        columns), or a dictionary where per column the bins are specified.
        (default=10)
        E.g.: bins = {'mileage': 5, 'driver_age': [18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins
        based on quantiles (True)
    :param retbins: if True, also return a dictionary of the bin labels used
        per column
    :returns: rebinned DataFrame
    :rtype: pandas.DataFrame
    """
    if isinstance(bins, dict):
        for col in cols:
            if col not in bins:
                raise ValueError(
                    'column {0} is not included in bins dictionary.'.format(col))

    # check for numeric bins
    for col in list(set(data._get_numeric_data().columns) - set(cols)):
        nuq = data[col].nunique()
        if (nuq > 0.9 * len(data)) or (nuq > 100):
            warnings.warn(
                "numeric variable {1:s} has {0:d} unique values. "
                "Are you sure you don't want to bin it?".format(nuq, str(col)),
                Warning)

    binned_data = data.copy()
    if isinstance(bins, (list, np.ndarray)):
        xbins = bins
    bins_dict = {}
    for col in cols:
        if isinstance(bins, (int, float)):
            xbins = bin_edges(data[col].astype(float), int(bins),
                              quantile=quantile)
        if isinstance(bins, dict):
            if isinstance(bins[col], (int, float)):
                xbins = bin_edges(data[col].astype(float), int(bins[col]),
                                  quantile=quantile)
            elif isinstance(bins[col], (list, np.ndarray)):
                xbins = bins[col]
        binned_data[col], bin_labels = bin_array(
            data[col].astype(float).values, xbins)
        if retbins:
            bins_dict[col] = bin_labels
    if retbins:
        return binned_data, bins_dict
    return binned_data
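# A usage sketch for bin_data mirroring the docstring example; it assumes the
# bin_edges/bin_array helpers referenced in the body are importable, and the
# trips frame is synthetic.
trips = pd.DataFrame({'mileage': np.random.uniform(0, 1e5, 500),
                      'driver_age': np.random.randint(18, 90, 500)})
binned = bin_data(trips, cols=['mileage', 'driver_age'],
                  bins={'mileage': 5,
                        'driver_age': [18, 25, 35, 45, 55, 65, 125]})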
def _fit(self, X: pd.DataFrame, y: pd.DataFrame = None):
    self.mapping_ = self.mapping
    if self.auto_input:
        for col in X._get_numeric_data().columns:
            if col not in self.mapping_.keys():
                self.mapping_[col] = self.numeric_input
    return self
def get_list_non_numerical_columns_of_df(dataframe: pd.DataFrame) -> List:
    """
    Returns a list of column names, where these names correspond to
    non-numerical attributes.
    """
    cols = dataframe.columns
    num_cols = dataframe._get_numeric_data().columns
    return list(set(cols) - set(num_cols))
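# Usage sketch (illustrative frame; assumes pandas imported as pd):
frame = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y'], 'c': [0.1, 0.2]})
get_list_non_numerical_columns_of_df(frame)  # -> ['b']
# Note: the set difference does not preserve column order, so with several
# non-numeric columns the order of the result is not guaranteed.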
def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None, fname=None):
    """
    Plot the means and standard deviations of each dataset.

    :param real: DataFrame containing the real data
    :param fake: DataFrame containing the fake data
    :param ax: Axis to plot on. If none, a new figure is made.
    :param fname: If not none, saves the plot with this file name.
    """
    created_fig = ax is None  # remember whether we own the figure
    if created_fig:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        fig.suptitle('Absolute Log Mean and STDs of numeric data\n',
                     fontsize=16)

    ax[0].grid(True)
    ax[1].grid(True)
    real = real._get_numeric_data()
    fake = fake._get_numeric_data()

    real_mean = np.log(np.add(abs(real.mean()).values, 1e-5))
    fake_mean = np.log(np.add(abs(fake.mean()).values, 1e-5))
    min_mean = min(real_mean) - 1
    max_mean = max(real_mean) + 1
    line = np.arange(min_mean, max_mean)
    sns.lineplot(x=line, y=line, ax=ax[0])
    sns.scatterplot(x=real_mean, y=fake_mean, ax=ax[0])
    ax[0].set_title('Means of real and fake data')
    ax[0].set_xlabel('real data mean (log)')
    ax[0].set_ylabel('fake data mean (log)')

    real_std = np.log(np.add(real.std().values, 1e-5))
    fake_std = np.log(np.add(fake.std().values, 1e-5))
    min_std = min(real_std) - 1
    max_std = max(real_std) + 1
    line = np.arange(min_std, max_std)
    sns.lineplot(x=line, y=line, ax=ax[1])
    sns.scatterplot(x=real_std, y=fake_std, ax=ax[1])
    ax[1].set_title('Stds of real and fake data')
    ax[1].set_xlabel('real data std (log)')
    ax[1].set_ylabel('fake data std (log)')

    if fname is not None:
        plt.savefig(fname)

    if created_fig:
        # the original `if ax is None` check could never fire here because
        # ax was reassigned above; use the flag captured at entry instead
        plt.show()
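# A quick sketch of plot_mean_std on two random frames, assuming the
# matplotlib/seaborn/numpy imports used in the function body:
r = pd.DataFrame(np.random.rand(100, 3), columns=['x', 'y', 'z'])
f = pd.DataFrame(np.random.rand(100, 3), columns=['x', 'y', 'z'])
plot_mean_std(r, f)  # makes its own 1x2 figure and shows it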
def test_get_numeric_data(self):
    # TODO(wesm): unused?
    intname = np.dtype(np.int_).name  # noqa
    floatname = np.dtype(np.float_).name  # noqa
    datetime64name = np.dtype('M8[ns]').name
    objectname = np.dtype(np.object_).name

    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'f': Timestamp('20010102')},
                   index=np.arange(10))
    result = df.get_dtype_counts()
    expected = Series({'int64': 1, 'float64': 1,
                       datetime64name: 1, objectname: 1})
    # sort_index returns a new object; assign it so the sort takes effect
    result = result.sort_index()
    expected = expected.sort_index()
    assert_series_equal(result, expected)

    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'd': np.array([1.] * 10, dtype='float32'),
                    'e': np.array([1] * 10, dtype='int32'),
                    'f': np.array([1] * 10, dtype='int16'),
                    'g': Timestamp('20010102')},
                   index=np.arange(10))

    result = df._get_numeric_data()
    expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
    assert_frame_equal(result, expected)

    only_obj = df.loc[:, ['c', 'g']]
    result = only_obj._get_numeric_data()
    expected = df.loc[:, []]
    assert_frame_equal(result, expected)

    df = DataFrame.from_dict(
        {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
    result = df._get_numeric_data()
    expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
    assert_frame_equal(result, expected)

    df = result.copy()
    result = df._get_numeric_data()
    expected = df
    assert_frame_equal(result, expected)
def plot_hists(df: pd.DataFrame, cols: list = None, out_path: str = None,
               show_p: bool = True, return_p: bool = False, h: int = None,
               w: int = None, spacing: float = 0.05,
               theme: str = 'simple_white', renderer: str = 'browser',
               n_cols: int = 3, shared_yaxes: bool = True,
               cols_like: list = None, cumulative: bool = False,
               dim: str = None):
    """plot histogram"""
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    n_rows = math.ceil(len(cols) / n_cols)
    p = make_subplots(rows=n_rows, cols=n_cols, shared_yaxes=shared_yaxes,
                      vertical_spacing=spacing, horizontal_spacing=spacing)
    # figure out what to plot where on the subplot
    axes_dict = dict()
    i = 0
    for index, x in np.ndenumerate(np.zeros((n_cols, n_rows))):
        axes_dict[i] = index
        i += 1
    # make each plot
    for i, col in enumerate(cols):
        if dim:
            for dim_value in df[dim].unique():
                p.add_trace(
                    go.Histogram(
                        name=f'{col} - {dim_value}',
                        x=df[df[dim] == dim_value][col],
                        cumulative_enabled=cumulative,
                        bingroup=1,
                        histnorm='probability density'
                    ),
                    row=axes_dict[i][1] + 1, col=axes_dict[i][0] + 1
                )
            p.update_layout(barmode='overlay')
            p.update_traces(opacity=0.5)
        else:
            p.add_trace(
                go.Histogram(name=col, x=df[col],
                             cumulative_enabled=cumulative),
                row=axes_dict[i][1] + 1, col=axes_dict[i][0] + 1
            )
    if h:
        p.update_layout(height=h)
    if w:
        p.update_layout(width=w)
    p.update_layout(template=theme)
    if out_path:
        plotly.offline.plot(p, filename=out_path, auto_open=False)
    if show_p:
        p.show(renderer=renderer)
    if return_p:
        return p
def test_get_numeric_data_extension_dtype(self):
    # GH 22290
    df = DataFrame({
        'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'),
        'B': Categorical(list('abcabc')),
        'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'),
        'D': IntervalArray.from_breaks(range(7))})
    result = df._get_numeric_data()
    expected = df.loc[:, ['A', 'C']]
    assert_frame_equal(result, expected)
def col_numeric_cat_split(df: pd.DataFrame) -> tuple:
    """
    Takes in a pandas DataFrame and returns a list of numeric columns and
    another list of categorical columns.
    """
    num_cols = df._get_numeric_data().columns
    cols = df.columns
    categorical_cols = list(set(cols) - set(num_cols))
    num_cols = list(num_cols)
    return num_cols, categorical_cols
def _check_num_nans(self, data: pd.DataFrame) -> bool:
    """
    Check for NaNs in the numeric features of data.

    Args:
        data (pd.DataFrame, shape (n_samples, n_features)): the input data

    Return:
        bool: True or False
    """
    data = data._get_numeric_data()
    return len(list(data.columns[data.isnull().sum() > 0])) > 0
def test_get_X_columns(self):
    # numeric and object columns
    df = DataFrame({'a': [1, 2, 3],
                    'b': [True, False, True],
                    'c': ['foo', 'bar', 'baz'],
                    'd': [None, None, None],
                    'e': [3.14, 0.577, 2.773]})
    tm.assert_index_equal(df._get_numeric_data().columns,
                          pd.Index(['a', 'b', 'e']))
def get_n(recalculate: bool,
          data_df: pd.DataFrame,
          filepath: Optional[str] = None) -> pd.DataFrame:
    """Get sample sizes

    Parameters
    ----------
    recalculate :
        If True, recalculate the sample sizes
    data_df :
        Original raw data as a dataframe
    filepath :
        If `recalculate==False`: read the sample-size values from this file.
        If `recalculate==True`: write the sample-size values to this file.
        If not provided, run the calculation and return the sample-size data
        without writing it to a file.

    Returns
    -------
    :
        A dataframe holding the sample sizes
    """
    start = time()
    if recalculate or filepath is None:
        logger.info('Calculating sampling values')
        # note: num_cols is taken from the full frame, so this assumes all
        # columns of data_df are numeric
        num_cols = data_df.shape[1]
        data_mat = data_df._get_numeric_data().to_numpy(dtype=float,
                                                        na_value=np.nan,
                                                        copy=False)
        n_mat = np.zeros((num_cols, num_cols))
        group_start = time()
        for a_ix in range(num_cols):
            if a_ix % 100 == 0:
                print(a_ix, '%.2f' % (time() - group_start),
                      'sec per round,', int(time() - start), 'sec total')
                group_start = time()
            n_mat[a_ix, a_ix] = (~np.isnan(data_mat[:, a_ix])).sum()
            for b_ix in range(a_ix + 1, num_cols):
                n = (~np.isnan(data_mat[:, a_ix]) &
                     ~np.isnan(data_mat[:, b_ix])).sum()
                n_mat[a_ix, b_ix] = n
                n_mat[b_ix, a_ix] = n
        data_n = pd.DataFrame(n_mat, index=data_df.columns,
                              columns=data_df.columns)
        if filepath is not None:
            logger.info(f'Saving sampling matrix to {"%s.h5" % filepath}')
            data_n.to_hdf('%s.h5' % filepath, filepath.split('/')[-1])
    else:
        logger.info(f'Reading sampling values from file {filepath}')
        data_n = pd.read_hdf('%s.h5' % filepath)
    elapsed = time() - start
    print(int(elapsed), 'sec')
    return data_n
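# A small in-memory sketch of get_n (recalculate=True, no file written);
# the frame is illustrative and the module-level logger/time imports are
# assumed to be in scope:
raw = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 2.0, 4.0]})
n_df = get_n(recalculate=True, data_df=raw)
# n_df.loc['a', 'b'] == 1.0: only one row has both 'a' and 'b' observed.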
def test_get_numeric_data_mixed_dtype(self):
    # numeric and object columns
    df = DataFrame({
        "a": [1, 2, 3],
        "b": [True, False, True],
        "c": ["foo", "bar", "baz"],
        "d": [None, None, None],
        "e": [3.14, 0.577, 2.773],
    })
    result = df._get_numeric_data()
    tm.assert_index_equal(result.columns, Index(["a", "b", "e"]))
def remove_negatives(micro_df: pd.DataFrame):
    """
    Replaces negative values with NaN. Changes df in place.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    numeric = micro_df._get_numeric_data()
    numeric.where(numeric >= 0, np.nan, inplace=True)
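# Usage sketch (illustrative values; assumes numpy as np and pandas as pd).
m = pd.DataFrame({'conc': [1.5, -0.2, 3.0], 'site': ['A', 'B', 'C']})
remove_negatives(m)
# Under classic pandas block semantics _get_numeric_data() can return a view,
# so the in-place where() above propagates to m ('conc' becomes
# [1.5, NaN, 3.0]); under copy-on-write (pandas >= 2) this pattern may no
# longer mutate the caller's frame.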
def test_get_numeric_data_extension_dtype(self):
    # GH 22290
    df = DataFrame(
        {
            "A": integer_array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"),
            "B": Categorical(list("abcabc")),
            "C": integer_array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"),
            "D": IntervalArray.from_breaks(range(7)),
        }
    )
    result = df._get_numeric_data()
    expected = df.loc[:, ["A", "C"]]
    assert_frame_equal(result, expected)
def test_get_X_columns(self): # numeric and object columns df = DataFrame({ "a": [1, 2, 3], "b": [True, False, True], "c": ["foo", "bar", "baz"], "d": [None, None, None], "e": [3.14, 0.577, 2.773], }) tm.assert_index_equal(df._get_numeric_data().columns, pd.Index(["a", "b", "e"]))
def fit(self, data: pd.DataFrame, cols: Optional[List[str]] = None) -> None: """ Parameters ---------- data : pd.DataFrame dataset (pd.DataFrame shape = (n_samples, n_features)) cols : Optional[List[str]], optional cols list features, by default None Returns ------- self Raises ------ Exception No numerical features """ if cols is not None: data = data[cols] data = data._get_numeric_data() self.columns = data.columns count_columns = len(self.columns) if count_columns < 1: raise ValueError("No numerical features") self.scaler = MinMaxScaler().fit(data) s_data = self.scaler.transform(data) units = 512 if count_columns > 512: units = count_columns self.autoencoder = self._get_dae(count_columns, units=units) self.autoencoder.fit( s_data, s_data, epochs=50, batch_size=124, shuffle=True, verbose=self.verbose, ) return self
def plot_lines(df: pd.DataFrame, cols: list = None, cols_like: list = None,
               x: str = None, h: int = 300, w: int = 1200,
               t_str: str = 'box_zoom,pan,hover,reset,save',
               x_type: str = 'datetime', show_p: bool = True,
               t_loc: str = 'right', out_path: str = None,
               return_p: bool = False, palette: str = 'Category20',
               p_theme: str = 'light_minimal', notebook: bool = False):
    """Plot lines."""
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    # define x axis if needed
    if not x:
        x = df.index.name
    # define source
    source = ColumnDataSource(df)
    # define palette
    if palette == 'Category20':
        p_palette = Category20[20]
    else:
        raise NotImplementedError(f'... palette {palette} not implemented ...')
    p = make_figure(h=h, w=w, x_type=x_type, t_loc=t_loc, t_str=t_str)
    for i, col in enumerate(cols):
        p.line(x, col, source=source, name=col, color=p_palette[i])
    add_hover(p, cols)
    if out_path:
        output_file(out_path)
    curdoc().theme = p_theme
    if notebook:
        output_notebook()
    if show_p:
        show(p)
    if return_p:
        return p
def __init__(self, real: pd.DataFrame, fake: pd.DataFrame, cat_cols=None,
             unique_thresh=0, metric='pearsonr', verbose=False,
             n_samples=None, name: str = None):
    """
    :param real: Real dataset (pd.DataFrame)
    :param fake: Synthetic dataset (pd.DataFrame)
    :param unique_thresh: Threshold for automatic evaluation if column is
        numeric
    :param cat_cols: The columns that are to be evaluated as discrete. If
        passed, unique_thresh is ignored.
    :param metric: the metric to use for evaluating linear relations.
        Pearson's r by default, but supports all models in scipy.stats
    :param verbose: Whether to print verbose output
    :param n_samples: Number of samples to evaluate. If none, it will take
        the minimal length of both datasets and cut the larger one off to
        make sure they are the same length.
    :param name: Name of the TableEvaluator. Used in some plotting functions
        like `helpers.plot_correlation_comparison` to indicate your model.
    """
    self.name = name
    self.unique_thresh = unique_thresh
    self.real = real.copy()
    self.fake = fake.copy()
    self.comparison_metric = getattr(stats, metric)
    self.verbose = verbose

    if cat_cols is None:
        self.numerical_columns = [
            column for column in real._get_numeric_data().columns
            if len(real[column].unique()) > unique_thresh
        ]
        self.categorical_columns = [
            column for column in real.columns
            if column not in self.numerical_columns
        ]
    else:
        self.categorical_columns = cat_cols
        self.numerical_columns = [
            column for column in real.columns if column not in cat_cols
        ]

    if n_samples is None:
        self.n_samples = min(len(self.real), len(self.fake))
    elif len(fake) >= n_samples and len(real) >= n_samples:
        self.n_samples = n_samples
    else:
        raise Exception(f'Make sure n_samples < len(fake/real). '
                        f'len(real): {len(real)}, len(fake): {len(fake)}')

    self.real = self.real.sample(self.n_samples)
    self.fake = self.fake.sample(self.n_samples)
    assert len(self.real) == len(self.fake), 'len(real) != len(fake)'

    self.real.loc[:, self.categorical_columns] = \
        self.real.loc[:, self.categorical_columns].fillna('[NAN]')
    self.fake.loc[:, self.categorical_columns] = \
        self.fake.loc[:, self.categorical_columns].fillna('[NAN]')
    self.real.loc[:, self.numerical_columns] = \
        self.real.loc[:, self.numerical_columns].fillna(
            self.real[self.numerical_columns].mean())
    self.fake.loc[:, self.numerical_columns] = \
        self.fake.loc[:, self.numerical_columns].fillna(
            self.fake[self.numerical_columns].mean())
def preprocess_numerical_data(data: pd.DataFrame, drop_cols: list,
                              scaler_filename: str, fit=True):
    data_transformed: pd.DataFrame = pd.DataFrame([])
    try:
        # Extract numerical features
        df_num: pd.DataFrame = data._get_numeric_data()
        df_num.drop(drop_cols, axis=1, inplace=True)
        # Scale data
        data_transformed = FeatureExtraction.scale_data(
            data=df_num, fit=fit, filename=scaler_filename)
        # Add back the dropped columns
        data_transformed = pd.concat(
            [data[drop_cols], data_transformed], axis=1, sort=False)
    except Exception as e:
        logger.error(e)
    return data_transformed
def plot_heatmap(df: pd.DataFrame, cols: list = None, cols_like: list = None,
                 id_vars: list = None, out_path: str = None,
                 show_p: bool = True, return_p: bool = False, h: int = None,
                 w: int = None, theme: str = 'plotly_white',
                 renderer: str = 'browser', colorscale: str = 'RdBu',
                 showscale: bool = False):
    """plot heatmap"""
    # get cols to plot
    if not cols:
        if cols_like:
            cols = get_cols_like(df, cols_like)
        else:
            cols = df._get_numeric_data().columns
    if not id_vars:
        id_vars = list(df.index.names)
    df = pd.melt(df.reset_index(), id_vars=id_vars, value_vars=cols)
    p = go.Figure(data=go.Heatmap(z=df['value'],
                                  x=df[','.join(id_vars)],
                                  y=df['variable'],
                                  colorscale=colorscale,
                                  showscale=showscale))
    if h:
        p.update_layout(height=h)
    if w:
        p.update_layout(width=w)
    p.update_layout(template=theme)
    if out_path:
        plotly.offline.plot(p, filename=out_path, auto_open=False)
    if show_p:
        p.show(renderer=renderer)
    if return_p:
        return p
def fit(self, data: pd.DataFrame, cols: Optional[List[str]] = None) -> None: """ Parameters ---------- data : pd.DataFrame dataset (pd.DataFrame shape = (n_samples, n_features)) cols : Optional[List[str]], optional cols list features, by default None Returns ------- self """ if cols is not None: data = data[cols] data = data._get_numeric_data() if self.verbose: for col in data.columns: pct_missing = np.mean(data[col].isnull()) if pct_missing > 0.25: logger.warning("! Attention {} - {}% Nans!".format( col, round(pct_missing * 100))) self.nan_columns = list(data.columns[data.isnull().sum() > 0]) if not self.nan_columns: logger.info("No nans features") if self.method == "median": self.fill_value = data.median() elif self.method == "mean": self.fill_value = data.mean() else: raise ValueError("Wrong fill method") return self