def test_get_type_pandas(self):
    d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
    df = pd.DataFrame(data=d)
    col1_type = PreprocessingUtils.get_type(df["col1"])
    self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
    col2_type = PreprocessingUtils.get_type(df["col2"])
    self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)
def _fit_na_fill(self, X):
    for column in self._columns:
        if np.sum(pd.isnull(X[column])) == 0:
            # no missing values in this column
            continue
        self._na_fill_params[column] = self._get_fill_value(X[column])
        if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME:
            self._datetime_columns += [column]
def _get_fill_value(self, x):
    # categorical type
    if PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL:
        if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
            # add a new categorical value for missing entries
            return PreprocessingMissingValues.MISSING_VALUE
        return PreprocessingUtils.get_most_frequent(x)
    # datetime type
    if PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME:
        return PreprocessingUtils.get_most_frequent(x)
    # numerical type
    if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
        return PreprocessingUtils.get_min(x) - 1.0
    if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN:
        return PreprocessingUtils.get_mean(x)
    return PreprocessingUtils.get_median(x)
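# Illustrative sketch (not part of the library): the numerical fill values that
# _get_fill_value produces for each fill method, assuming the get_min/get_mean/
# get_median helpers behave like plain pandas min/mean/median on non-null data.
import numpy as np
import pandas as pd

x = pd.Series([1.0, 2.0, np.nan, 4.0])
fill_min = x.min() - 1.0   # FILL_NA_MIN  -> 0.0, a value below the observed minimum
fill_mean = x.mean()       # FILL_NA_MEAN -> ~2.33
fill_median = x.median()   # any other method -> median, 2.0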
def _fit_categorical_convert(self, X):
    for column in self._columns:
        if PreprocessingUtils.get_type(X[column]) != PreprocessingUtils.CATEGORICAL:
            # no need to convert, already a number
            continue
        # limit the number of categories - needed for one-hot encoding
        # this code is also used in predict.py and transform_utils.py
        # TODO: it needs refactoring!
        too_many_categories = len(np.unique(list(X[column].values))) > 200
        lbl = None
        if (
            self._convert_method == PreprocessingCategorical.CONVERT_ONE_HOT
            and not too_many_categories
        ):
            lbl = LabelBinarizer()
            lbl.fit(X, column)
        else:
            lbl = LabelEncoder()
            lbl.fit(X[column])
        if lbl is not None:
            self._convert_params[column] = lbl.to_json()
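# A minimal sketch (hypothetical data, plain numpy only) of the decision rule
# above: one-hot encoding is applied only when it was requested and the column
# has at most 200 unique values; otherwise ordinal label encoding is fitted.
import numpy as np

column_values = np.array(["a", "b", "c", "a"])
one_hot_requested = True
too_many_categories = len(np.unique(column_values)) > 200  # False here (3 categories)
use_one_hot = one_hot_requested and not too_many_categories  # -> True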
def compute(X_train, y_train, eda_path):
    try:
        # check if the EDA directory already exists
        if os.path.exists(eda_path):
            # the EDA analysis was probably already done, skip it
            return
        else:
            # need to create the directory for the EDA analysis
            os.mkdir(eda_path)

        inform = defaultdict(list)

        if isinstance(y_train, pd.Series):
            if PreprocessingUtils.get_type(y_train) == "categorical":
                plt.figure(figsize=(5, 5))
                sns.countplot(y_train, color=BLUE)
                plt.title("Target class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, "target.png")
                plt.savefig(plot_path)
                plt.close("all")
            else:
                plt.figure(figsize=(5, 5))
                sns.distplot(y_train, color=BLUE)
                plt.title("Target class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, "target.png")
                plt.savefig(plot_path)
                plt.close("all")

            inform["missing"].append(
                pd.isnull(y_train).sum() * 100 / y_train.shape[0]
            )
            inform["unique"].append(y_train.nunique())
            inform["feature_type"].append(PreprocessingUtils.get_type(y_train))
            inform["plot"].append("![](target.png)")
            inform["feature"].append("target")
            inform["desc"].append(y_train.describe().to_dict())

        for col in X_train.columns:
            inform["feature_type"].append(PreprocessingUtils.get_type(X_train[col]))

            if PreprocessingUtils.get_type(X_train[col]) in (
                "categorical",
                "discrete",
            ):
                plt.figure(figsize=(5, 5))
                chart = sns.countplot(
                    X_train[col],
                    order=X_train[col].value_counts().iloc[:10].index,
                    color=BLUE,
                )
                chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
                plt.title(f"{col} class distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")
            elif PreprocessingUtils.get_type(X_train[col]) == "continous":
                plt.figure(figsize=(5, 5))
                sns.distplot(X_train[col], color=BLUE)
                plt.title(f"{col} value distribution")
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")
            elif PreprocessingUtils.get_type(X_train[col]) == "text":
                plt.figure(figsize=(10, 10), dpi=70)
                word_string = " ".join(X_train[col].str.lower())
                wordcloud = WordCloud(
                    width=500,
                    height=500,
                    stopwords=STOPWORDS,
                    background_color="white",
                    max_words=400,
                    max_font_size=None,
                ).generate(word_string)
                plt.imshow(wordcloud, aspect="auto", interpolation="nearest")
                plt.axis("off")
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")
            elif PreprocessingUtils.get_type(X_train[col]) == "datetime":
                plt.figure(figsize=(5, 5))
                pd.to_datetime(X_train[col]).plot(grid=True, color=BLUE)
                plt.tight_layout(pad=2.0)
                plot_path = os.path.join(eda_path, f"{col}.png")
                plt.savefig(plot_path)
                plt.close("all")

            inform["missing"].append(
                pd.isnull(X_train[col]).sum() * 100 / X_train.shape[0]
            )
            inform["unique"].append(int(X_train[col].nunique()))
            inform["plot"].append(f"![]({col}.png)")
            inform["feature"].append(str(col))
            inform["desc"].append(X_train[col].describe().to_dict())

        df = pd.DataFrame(inform)

        with open(os.path.join(eda_path, "README.md"), "w") as fout:
            for i, row in df.iterrows():
                fout.write(f"## Feature : {row['feature']}\n")
                fout.write(f"- **Feature type** : {row['feature_type']}\n")
                fout.write(f"- **Missing** : {row['missing']}%\n")
                fout.write(f"- **Unique** : {row['unique']}\n")
                for key in row["desc"].keys():
                    if key in ("25%", "50%", "75%"):
                        fout.write(
                            f"- **{key.capitalize()}th Percentile** : {row['desc'][key]}\n"
                        )
                    else:
                        fout.write(f"- **{key.capitalize()}** : {row['desc'][key]}\n")
                fout.write(f"- {row['plot']}\n")
    except Exception as e:
        logger.error(f"There was an issue when running EDA. {str(e)}")
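# Minimal usage sketch (illustrative, not part of the library): assumes compute()
# is available as defined above and pandas is imported as pd. The toy data below
# is made up for illustration only.
if __name__ == "__main__":
    import pandas as pd

    X_train = pd.DataFrame(
        {"age": [22, 35, 58, 41], "city": ["NY", "LA", "NY", "SF"]}
    )
    y_train = pd.Series([0, 1, 0, 1], name="target")
    compute(X_train, y_train, eda_path="EDA")
    # expected output: target.png, age.png, city.png and README.md inside ./EDA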
def extensive_eda(X, y, save_path):
    # validate input parameters
    if not isinstance(X, pd.DataFrame):
        raise ValueError("X should be a dataframe")
    if X.shape[0] != len(y):
        raise ValueError("X and y should have the same number of samples")
    if X.shape[1] > MAXCOL:
        X = X.iloc[:, :MAXCOL]
        warnings.warn(
            f"AutoML EDA column limit exceeded! Running only for the first {MAXCOL} columns"
        )

    if save_path:
        if not os.path.exists(save_path):
            os.mkdir(save_path)
    else:
        raise ValueError("Please provide a valid path to save the Extensive EDA")

    plt.style.use("ggplot")

    try:
        if PreprocessingUtils.get_type(y) in ("categorical", "discrete"):
            for col in X.columns:
                if PreprocessingUtils.get_type(X[col]) == "continous":
                    plt.figure(figsize=(5, 5))
                    for i in np.unique(y):
                        sns.kdeplot(
                            x=X.iloc[np.where(y == i)[0]][col],
                            label=f"class {i}",
                            shade=True,
                        )
                    plt.legend()
                    plt.gca().set_title(
                        f"Distribution of {col} for each class",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                    plt.close("all")
                elif PreprocessingUtils.get_type(X[col]) in (
                    "categorical",
                    "discrete",
                ):
                    if X[col].nunique() > 7:
                        warnings.warn("Considering the 7 most frequent values")
                    values = X[col].value_counts().index[:7]
                    plt.figure(figsize=(5, 5))
                    sns.countplot(
                        x=X[X[col].isin(values)][col], hue=y[X[col].isin(values)]
                    )
                    plt.gca().set_title(
                        f"Count plot of each {col}",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                    plt.close("all")
        elif PreprocessingUtils.get_type(y) == "continous":
            for col in X.columns:
                if PreprocessingUtils.get_type(X[col]) == "continous":
                    plt.figure(figsize=(5, 5))
                    plt.scatter(X[col].values, y)
                    plt.gca().set_xlabel(f"{col}")
                    plt.gca().set_ylabel("target")
                    plt.gca().set_title(
                        f"Scatter plot of {col} vs target",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                    plt.close("all")
                elif PreprocessingUtils.get_type(X[col]) in (
                    "categorical",
                    "discrete",
                ):
                    if X[col].nunique() > 7:
                        warnings.warn("Considering the 7 most frequent values")
                    plt.figure(figsize=(5, 5))
                    for i in X[col].value_counts().index[:7]:
                        sns.kdeplot(
                            x=y[X[X[col] == i].index],
                            shade=True,
                            label=f"{col}_{i}",
                        )
                    plt.gca().set_title(
                        f"Distribution of target for each {col}",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.legend()
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                    plt.close("all")
                elif PreprocessingUtils.get_type(X[col]) == "datetime":
                    plt.figure(figsize=(5, 5))
                    plt.plot(X[col], y)
                    plt.gca().set_xticklabels(X[col].dt.date, rotation=45)
                    plt.gca().set_title(
                        "Distribution of target over time",
                        fontsize=11,
                        weight="bold",
                        alpha=0.75,
                    )
                    plt.savefig(EDA.plot_path(save_path, col + "_target"))
                    plt.close("all")

        cols = [
            col
            for col in X.columns
            if PreprocessingUtils.get_type(X[col]) == "continous"
        ][:COLS]
        if len(cols) > 0:
            plt.figure(figsize=(10, 10))
            sns.heatmap(X[cols].corr())
            plt.gca().set_title("Heatmap", fontsize=11, weight="bold", alpha=0.75)
            plt.savefig(os.path.join(save_path, "heatmap.png"))
            plt.close("all")

        with open(os.path.join(save_path, "Extensive_EDA.md"), "w") as fout:
            for col in X.columns:
                fout.write(f"## Bivariate analysis of {col} feature with target\n")
                fout.write("\n![]({})\n".format(EDA.plot_fname(col + "_target")))
                fout.write("\n")
                fout.write("------------------------------------------------------\n")
            if len(cols) > 0:
                fout.write("## Heatmap\n")
                fout.write("![](heatmap.png)\n")
                fout.write("\n")
                fout.write("------------------------------------------------------\n")
    except Exception as e:
        raise AutoMLException(e)
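# Minimal usage sketch (illustrative, not part of the library): assumes
# extensive_eda() is available as defined above; the toy data is made up.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(42)
    X = pd.DataFrame({"f1": rng.rand(100), "f2": rng.randint(0, 3, 100)})
    y = pd.Series(rng.randint(0, 2, 100), name="target")
    extensive_eda(X, y, save_path="extensive_eda_output")
    # expected output: one bivariate plot per feature, heatmap.png and
    # Extensive_EDA.md inside ./extensive_eda_output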
def test_get_type_numpy_number(self):
    tmp = np.array([1, 2, 3])
    tmp_type = PreprocessingUtils.get_type(tmp)
    self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)
def test_get_type_numpy_categorical(self):
    tmp = np.array(["a", "b", "c"])
    tmp_type = PreprocessingUtils.get_type(tmp)
    self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)