def plot_cat_scatter(df, x_cols, y, format='png'):
    """Save one categorical scatter plot of ``y`` for every column in ``x_cols``.

    Each figure is written to
    ``./tmp/images/<col>_to_<y>_cat_scatter.<format>`` and closed afterwards.

    Args:
        df: The ``pandas.DataFrame`` holding the data to plot.
        x_cols: Iterable of column names used, one at a time, as the x axis.
        y: Column name plotted on the y axis; must not be part of ``x_cols``.
        format: File format/extension handed to ``plt.savefig``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        ConfigError: If ``y`` is also listed in ``x_cols``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if y in x_cols:
        from util.exceptions import ConfigError
        raise ConfigError(
            'The label "{}" was also configured as a feature in x_cols for function "plot_cat_scatter".'
            .format(y))
    import seaborn as sns

    for col in x_cols:
        grid = sns.catplot(x=col, y=y, data=df)
        ax = grid.axes[0, 0]
        # NOTE(review): labels come from df[col].unique(), i.e. first-appearance
        # order -- confirm this matches the category order seaborn puts on the axis.
        ax.set_xticklabels(df[col].unique().tolist(), rotation=90)
        n_categories = df[col].nunique()
        plt.xticks(range(int(n_categories)))
        plt.gca().margins(x=0)

        # The current figure does not change inside one iteration, so look it
        # up once instead of calling plt.gcf() for every statement.
        fig = plt.gcf()
        fig.canvas.draw()
        # The margin is computed from the width *before* the figure is resized.
        margin = 0.1 / fig.get_size_inches()[0]
        # bottom=0.333 leaves room for the tick labels rotated by 90 degrees.
        fig.subplots_adjust(left=margin, right=1. - margin, bottom=0.333)
        fig.set_size_inches(30, 7)
        plt.savefig('./tmp/images/{}_to_{}_cat_scatter.{}'.format(
            to_file_save_name(col), to_file_save_name(y), format),
                    format=format)
        plt.close(fig)
def train_val_test_split(df, label=None, split=(0.6, 0.2, 0.2), seed=None):
    """Shuffle ``df`` and cut it into three consecutive partitions.

    The first ``split[0]`` share of the shuffled rows becomes the train
    partition, the next ``split[1]`` share the partition returned as "test",
    and all remaining rows the partition returned as "val".

    Args:
        df: The ``pandas.DataFrame`` to split; it is copied, not modified.
        label: Name of the target column. When ``None`` the partitions are
            returned whole and every ``y`` slot is ``None``.
        split: Three fractions that must sum to 1.
        seed: Random state for the shuffle. ``None`` and the string ``'None'``
            both mean "unseeded".

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test, X_val, y_val)`` of
        NumPy arrays (``y_*`` entries are ``None`` when ``label`` is ``None``).

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        DataSplitError: If the fractions in ``split`` do not sum to 1.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    total = np.sum(split)
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is not exactly 1.0 in
    # floating point and would otherwise be rejected.
    if abs(total - 1.0) > 1e-9:
        raise DataSplitError(total)

    shuffled = df.copy()
    if seed is None or seed == 'None':
        shuffled = shuffle(shuffled)
    else:
        shuffled = shuffle(shuffled, random_state=seed)

    n_rows = len(shuffled)
    train_end = int(split[0] * n_rows)
    test_end = int(split[1] * n_rows) + train_end
    train = shuffled[:train_end]
    test = shuffled[train_end:test_end]
    # The last partition takes every remaining row, so rounding never drops data.
    val = shuffled[test_end:]

    if label is None:
        # The original returned the undefined name `_` here (NameError).
        return train.values, None, test.values, None, val.values, None

    y_train = train[label].values
    X_train = train.drop([label], axis=1).values
    y_test = test[label].values
    X_test = test.drop([label], axis=1).values
    y_val = val[label].values
    X_val = val.drop([label], axis=1).values
    return X_train, y_train, X_test, y_test, X_val, y_val
def apply_binary_encoding(df, categorical_columns):
    """Binary-encode the given categorical columns of ``df``.

    Args:
        df: The ``pandas.DataFrame`` to encode.
        categorical_columns: Column names passed to the encoder as ``cols``.

    Returns:
        The result of ``category_encoders.BinaryEncoder.transform`` on ``df``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    encoder = ce.BinaryEncoder(cols=categorical_columns)
    # Fit on the DataFrame itself: fitting on `df.values` discards the column
    # names that `cols` refers to and that `transform(df)` is matched against.
    encoder.fit(df)
    return encoder.transform(df)
def describe_data(df, name, n, include='all'):
    """Write a plain-text summary of ``df`` via ``to_txt_with_versioning``.

    The report contains, in order: the transposed ``describe()`` table, the
    column count, the total NaN count, the dtypes, and the first and last
    ``n`` rows.

    Args:
        df: The ``pandas.DataFrame`` to describe.
        name: Data set name; used in the heading and in the output path
            ``./tmp/description/<name>``.
        n: Number of rows shown for head and tail.
        include: Forwarded to ``DataFrame.describe``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    # Collect the report sections in output order and join them once.
    sections = [
        'Data description for {}\n'.format(name),
        df.describe(include=include).T.to_string(),
        '\n\nThe data set has {} columns.'.format(len(df.columns)),
        '\n\nThe data set has {} NaN values.'.format(
            df.isnull().values.sum()),
        '\n\nDataFrame data types:\n\n',
        df.dtypes.to_string(),
        '\n\nDataFrame head{}:\n\n'.format(n),
        df.head(n).to_string(),
        '\n\n DataFrame tail{}:\n\n'.format(n),
        df.tail(n).to_string(),
    ]
    report = ''.join(sections)
    target = './tmp/description/{}'.format(name)
    to_txt_with_versioning(target, report)
def apply_weight_of_evidence_encoding(df, categorical_columns, label='y'):
    """Weight-of-evidence-encode the given categorical columns of ``df``.

    The encoder is fit on the feature columns (everything except ``label``)
    against ``df[label]``. The same feature columns are then transformed and
    the untouched label column is appended to the result.

    Args:
        df: The ``pandas.DataFrame`` containing features and the label.
        categorical_columns: Column names passed to the encoder as ``cols``.
        label: Name of the target column.

    Returns:
        The encoded feature frame with the original ``label`` column added.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    features = df.drop([label], axis=1)
    encoder = ce.WOEEncoder(cols=categorical_columns).fit(features, df[label])
    # Transform the same columns the encoder was fit on; the original passed
    # the full frame (label included), which does not match the fitted input.
    X_transformed = encoder.transform(features)
    X_transformed[label] = df[label]
    return X_transformed
def drop_csv_column(df):
    """Remove the ``'Unnamed: 0'`` / ``'Unnamed: 0.1'`` columns from ``df`` in place.

    Args:
        df: The ``pandas.DataFrame`` to clean; it is modified in place.

    Returns:
        The same ``df`` object, without those columns where they existed.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    for leftover in ('Unnamed: 0', 'Unnamed: 0.1'):
        if leftover in df.columns:
            df.drop(columns=[leftover], inplace=True)
    return df
def apply_sum_encoding(df, categorical_columns):
    """Sum-encode the given categorical columns of ``df``.

    Args:
        df: The ``pandas.DataFrame`` to encode.
        categorical_columns: Column names passed to the encoder as ``cols``.

    Returns:
        The encoded frame with any ``'intercept'`` column removed.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    # Fit on the DataFrame itself: fitting on `df.values` discards the column
    # names that `cols` refers to and that `transform(df)` is matched against.
    encoder = ce.SumEncoder(cols=categorical_columns).fit(df)
    X_transformed = encoder.transform(df)
    # The original called `X.drop(...)` on an undefined name (NameError).
    # errors='ignore' keeps this working if the encoder emits no 'intercept'
    # column -- presumably version dependent; verify against the installed release.
    X_transformed = X_transformed.drop(columns=['intercept'], errors='ignore')
    return X_transformed
def extract_X_y(df, label):
    """Split ``df`` into a feature array and a label array.

    Args:
        df: The ``pandas.DataFrame`` holding features and, optionally, a label.
        label: Name of the label column, or ``None`` when there is no label.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without the label column as a NumPy
        array and ``y`` the label values. With ``label=None`` all columns are
        returned as ``X`` and ``y`` is ``None``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if label is None:
        # Nothing to separate; the original still tried to drop a column
        # named None here, which raises KeyError.
        return df.values, None
    y = df[label].values
    X = df.drop([label], axis=1).values
    return X, y
def clean_labels(df, label):
    """Replace the values of the ``label`` column with integer class indices.

    Each distinct label value is mapped to its position in
    ``df[label].unique()``, i.e. classes are numbered in order of first
    appearance. ``df`` is modified in place.

    Args:
        df: The ``pandas.DataFrame`` whose label column is rewritten.
        label: Name of the label column.

    Returns:
        The same ``df`` object with the re-coded label column.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    classes = df[label].unique().tolist()
    # A dict lookup on the label column alone replaces the row-wise
    # DataFrame.apply + list.index scan, which did a linear search per row.
    class_index = {value: position for position, value in enumerate(classes)}
    df[label] = df[label].map(class_index)
    return df
def apply_leave_one_out_encoding(df, categorical_columns, label='y'):
    """Leave-one-out-encode the given categorical columns of ``df``.

    The encoder is fit on the feature columns against ``df[label]``, the
    feature columns are transformed, and the label column is appended again.

    Args:
        df: The ``pandas.DataFrame`` containing features and the label.
        categorical_columns: Column names passed to the encoder as ``cols``.
        label: Name of the target column.

    Returns:
        The encoded feature frame with the original ``label`` column added.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    target = df[label]
    loo = ce.LeaveOneOutEncoder(return_df=True, cols=categorical_columns)
    fitted = loo.fit(df.drop([label], axis=1), target)
    encoded = fitted.transform(df.drop([label], axis=1))
    encoded[label] = target
    return encoded
def plot_regression(df, cols, cat_cols, format='png'):
    """Save regression plots for every pair of ``cols``, faceted by each ``cat_cols`` entry.

    For each 2-combination of ``cols`` and each categorical column, an
    ``sns.lmplot`` is drawn and written to
    ``./tmp/images/<x>_to_<y>_by_<cat>_regression.<format>``.

    Args:
        df: The ``pandas.DataFrame`` holding the data to plot.
        cols: Column names to pair up as x/y axes.
        cat_cols: Column names used, one at a time, as the ``col`` facet.
        format: File format/extension handed to ``plt.savefig``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import seaborn as sns
    from itertools import combinations

    for first, second in combinations(cols, 2):
        for facet in cat_cols:
            sns.lmplot(x=first, y=second, col=facet, data=df)
            target = './tmp/images/{}_to_{}_by_{}_regression.{}'.format(
                to_file_save_name(first), to_file_save_name(second),
                to_file_save_name(facet), format)
            plt.savefig(target, format=format)
            # Close the figure that was just saved so figures do not pile up.
            plt.close(plt.gcf())
def clean_numeric(df, cols):
    """Keep only the rows whose values in ``cols`` can be converted by ``float()``.

    Args:
        df: The ``pandas.DataFrame`` to filter; it is not modified.
        cols: Column names that must hold float-convertible values.

    Returns:
        A filtered DataFrame containing the rows that passed for every column.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)

    def is_number(value):
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError covers None and other non-string, non-numeric objects,
            # which float() rejects with a different exception than bad text.
            return False
        return True

    for col in cols:
        # astype(bool) guarantees a boolean mask even when the frame is empty
        # and apply() yields an object-dtype result.
        mask = df[col].apply(is_number).astype(bool)
        df = df[mask]
    return df
def apply_pca(df, label=None, n_components=None):
    """Project the feature columns of ``df`` onto their principal components.

    NOTE(review): a second ``apply_pca(df, categorical_columns, label='y')``
    stub is defined later in this module; if both stay at module level the
    later definition replaces this one.

    Args:
        df: The ``pandas.DataFrame`` to transform.
        label: Optional target column. It is excluded from the PCA and
            appended unchanged to the result.
        n_components: Forwarded to ``sklearn.decomposition.PCA``.

    Returns:
        A DataFrame indexed like ``df`` with columns ``PCA0``, ``PCA1``, ...
        (one per component actually produced) plus ``label`` if given.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components)
    features = df if label is None else df.drop([label], axis=1)
    components = pca.fit_transform(features)
    # Name the columns from the actual output width: `range(n_components)`
    # fails for the default None (and for any non-integer setting).
    column_names = ['PCA%i' % i for i in range(components.shape[1])]
    X = pd.DataFrame(components, columns=column_names, index=df.index)
    if label is not None:
        X[label] = df[label]
    return X
def plot_scatter(df, cols, y, format='png'):
    """Save a scatter plot, coloured by ``y``, for every pair of columns in ``cols``.

    Each figure is written to
    ``./tmp/images/<x>_to_<y>_by_<label>_scatter.<format>``.

    Args:
        df: The ``pandas.DataFrame`` holding the data to plot.
        cols: Column names to pair up as x/y axes.
        y: Column name used as ``hue``; must not be part of ``cols``.
        format: File format/extension handed to ``plt.savefig``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        ConfigError: If ``y`` is also listed in ``cols``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if y in cols:
        from util.exceptions import ConfigError
        message = 'The label "{}" was also configured as a feature in cols for function "plot_scatter".'.format(y)
        raise ConfigError(message)
    import seaborn as sns
    from itertools import combinations

    for first, second in combinations(cols, 2):
        sns.scatterplot(x=first, y=second, hue=y, data=df, s=10, alpha=0.4)
        target = './tmp/images/{}_to_{}_by_{}_scatter.{}'.format(
            to_file_save_name(first), to_file_save_name(second),
            to_file_save_name(y), format)
        plt.savefig(target, format=format)
        # Close the figure that was just saved so figures do not pile up.
        plt.close(plt.gcf())
def plot_boxplots(df, columns=-1, label=None, format='png'):
    """Save one box plot per column to ``./tmp/images/<col>_box.<format>``.

    Args:
        df: The ``pandas.DataFrame`` holding the data to plot.
        columns: Column names to plot; the sentinel ``-1`` means "all columns".
        label: Optional column to group by. The label column itself is
            plotted ungrouped.
        format: File format/extension handed to ``plt.savefig``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    # Only compare against the sentinel when `columns` is a plain int:
    # `Index == -1` / `ndarray == -1` is element-wise and cannot be used in `if`.
    if isinstance(columns, int) and columns == -1:
        columns = df.columns
    for col in columns:
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        if label is None or col == label:
            df.boxplot(column=col, ax=ax)
        else:
            df.boxplot(column=col, by=label, ax=ax)
        plt.savefig('./tmp/images/{}_box.{}'.format(to_file_save_name(col),
                                                    format),
                    format=format)
        plt.close(fig)
def plot_cat_point(df, x_cols, y, format='png'):
    """Save a categorical point plot of ``y`` for every pair of columns in ``x_cols``.

    For each 2-combination ``(x_col, hue_col)`` of ``x_cols`` the figure is
    written to ``./tmp/images/<x>_to_<y>_hue_<hue>_cat_point.<format>`` and
    closed afterwards.

    Args:
        df: The ``pandas.DataFrame`` holding the data to plot.
        x_cols: Column names combined pairwise as x axis and hue.
        y: Column name plotted on the y axis; must not be part of ``x_cols``.
        format: File format/extension handed to ``plt.savefig``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
        ConfigError: If ``y`` is also listed in ``x_cols``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    if y in x_cols:
        from util.exceptions import ConfigError
        raise ConfigError(
            'The label "{}" was also configured as a feature in x_cols for function "plot_cat_point".'
            .format(y))
    import seaborn as sns
    import itertools

    for x_col, hue_col in itertools.combinations(x_cols, 2):
        grid = sns.catplot(x=x_col, y=y, hue=hue_col, kind='point', data=df)
        ax = grid.axes[0, 0]
        # NOTE(review): labels come from df[x_col].unique(), i.e. first-appearance
        # order -- confirm this matches the category order seaborn puts on the axis.
        ax.set_xticklabels(df[x_col].unique().tolist(), rotation=90)
        n_categories = df[x_col].nunique()
        plt.xticks(range(int(n_categories)))
        plt.gca().margins(x=0)

        # The current figure does not change inside one iteration, so look it
        # up once instead of calling plt.gcf() for every statement.
        fig = plt.gcf()
        fig.canvas.draw()
        # The margin is computed from the width *before* the figure is resized.
        margin = 0.1 / fig.get_size_inches()[0]
        # bottom=0.333 leaves room for the tick labels rotated by 90 degrees.
        fig.subplots_adjust(left=margin, right=1. - margin, bottom=0.333)
        fig.set_size_inches(30, 7)
        plt.savefig('./tmp/images/{}_to_{}_hue_{}_cat_point.{}'.format(
            to_file_save_name(x_col), to_file_save_name(y),
            to_file_save_name(hue_col), format),
                    format=format)
        plt.close(fig)
def replace_NaN_with_mean(df):
    """Fill missing values in ``df`` with the per-column mean, in place.

    Means are computed with ``numeric_only=True``, so only columns that have
    such a mean receive a fill value.

    Args:
        df: The ``pandas.DataFrame`` to fill; it is modified in place.

    Returns:
        The same ``df`` object.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    column_means = df.mean(numeric_only=True)
    fill_values = column_means.to_dict()
    df.fillna(value=fill_values, inplace=True)
    return df
def remove_NaN_columns(df, how='any'):
    """Drop columns containing missing values from ``df`` in place.

    Args:
        df: The ``pandas.DataFrame`` to prune; it is modified in place.
        how: Forwarded to ``DataFrame.dropna`` (``'any'`` or ``'all'``).

    Returns:
        The same ``df`` object.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        df.dropna(axis=1, how=how, inplace=True)
        return df
    raise DataFrameTypeError('df', df)
def apply_pca(df, categorical_columns, label='y'):
    """Placeholder for a PCA step; validates ``df`` and returns ``None``.

    NOTE(review): this module also defines
    ``apply_pca(df, label=None, n_components=None)`` earlier on. Because this
    stub comes later, it is presumably the definition callers end up with --
    confirm which of the two is intended to survive.

    Args:
        df: The ``pandas.DataFrame`` to transform.
        categorical_columns: Currently unused.
        label: Currently unused.

    Returns:
        Always ``None``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        # TODO: implement
        return None
    raise DataFrameTypeError('df', df)
def apply_dummy_encoding(df):
    """Return ``df`` with dummy (indicator) columns created by ``pd.get_dummies``.

    Args:
        df: The ``pandas.DataFrame`` to encode.

    Returns:
        The DataFrame produced by ``pd.get_dummies(df)``.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        return pd.get_dummies(df)
    raise DataFrameTypeError('df', df)
def drop_columns(df, cols):
    """Drop the columns ``cols`` from ``df`` in place.

    Args:
        df: The ``pandas.DataFrame`` to modify in place.
        cols: Column label or list of labels to remove.

    Returns:
        The same ``df`` object.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if isinstance(df, pd.DataFrame):
        df.drop(columns=cols, inplace=True)
        return df
    raise DataFrameTypeError('df', df)
def apply_standard_scaler_scaling(df):
    """Standardise every column of ``df`` with scikit-learn's ``StandardScaler``.

    The scaled values are written back into ``df``.

    Args:
        df: The ``pandas.DataFrame`` to scale; it is modified in place.

    Returns:
        The same ``df`` object with scaled values.

    Raises:
        DataFrameTypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[df.columns])
    df[df.columns] = scaled
    return df