def _plot_grouped_density(df, value_name, group_name, ax):
    """Draw one density curve of `value_name` per level of `group_name` on `ax`."""
    for level, group in df.groupby(by=group_name):
        # Groups with a single row are skipped -- presumably because a density
        # estimate cannot be fitted to one observation.
        if group.shape[0] > 1:
            group[value_name].plot(kind="density", label=level, ax=ax)
    ax.set_xlabel("%s|%s" % (value_name, group_name))


def plot_feature_pair(df, xname, yname, ax=None, legend=True, figsize=None, *args, **kwargs):
    """
    Plot a pair of features, choosing the chart from the feature types:

    1. numerical vs numerical - scatter plot with a lowess trend line
    2. numerical vs categorical (either order) - density curves of the
       numerical feature, one per level of the categorical feature
    3. categorical vs categorical - horizontal stacked bar chart of the
       cross-tabulated counts

    This helps spot useful features that are both common and have extreme
    patterns (for classification).

    df: DataFrame holding both features
    xname: name of feature x (usually an input feature of interest)
    yname: name of feature y (usually the output feature)
    ax: matplotlib axes to draw on; a new figure/axes pair is created when None
    legend: when true, a legend is added at the "best" location
    figsize: figure size, used only when `ax` is None and a figure is created
    args, kwargs: extra plotting parameters, forwarded to `ax.scatter` in the
        numerical vs numerical case only. Keyword arguments override the
        default color/size/marker.

    Returns None; everything is drawn on `ax`.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    # Whether a column counts as numerical is decided by `is_numerical`,
    # which is defined elsewhere in this file.
    x_is_numerical = is_numerical(df, xname)
    y_is_numerical = is_numerical(df, yname)
    x_dtype = "numerical" if x_is_numerical else "categorical"
    y_dtype = "numerical" if y_is_numerical else "categorical"
    x, y = df[xname], df[yname]

    if x_is_numerical and y_is_numerical:
        # Defaults first, then caller keywords, so a caller can restyle the
        # points instead of hitting a duplicate-keyword TypeError.
        scatter_style = dict(color="blue", s=10, marker=".")
        scatter_style.update(kwargs)
        ax.scatter(x, y, *args, **scatter_style)
        # With the default return_sorted=True, lowess returns an (n, 2) array
        # of (x, fitted) rows ordered by x. Plotting those columns keeps each
        # fitted value paired with its own x; sorting x and the fitted values
        # separately would force the curve to be monotonically increasing.
        smoothed = sm.nonparametric.lowess(y, x)
        ax.plot(smoothed[:, 0], smoothed[:, 1], "r-", label="lowess", alpha=1)
        ax.set_xlabel("%s(%s)" % (xname, x_dtype))
        ax.set_ylabel("%s(%s)" % (yname, y_dtype))
    elif x_is_numerical:
        # numerical x, categorical y: density of x within each level of y
        _plot_grouped_density(df, xname, yname, ax)
    elif y_is_numerical:
        # categorical x, numerical y: density of y within each level of x
        _plot_grouped_density(df, yname, xname, ax)
    else:
        # categorical and categorical
        pd.crosstab(x, y, margins=False).plot(kind="barh", stacked=True, ax=ax)
        ax.set_xlabel("dist. of %s" % yname)

    if legend:
        ax.legend(loc="best")
def find_numerical_features(df):
    """
    Collect the names of the columns of `df` that `is_numerical` accepts.

    df: DataFrame whose columns are inspected, in column order

    Returns a NumPy array built from the accepted column names.
    """
    numerical_names = []
    for column in df.columns:
        if is_numerical(df, column):
            numerical_names.append(column)
    return np.asarray(numerical_names)