def complete_build(x_train, x_test, y_train, y_test):
    """Collect model statistics for label encoding and one hot encoding.

    Statistics are first computed on the data handed in by the caller
    (tagged ``'labelencoder'``). The data set is then split again, one hot
    encoded, and a second set of statistics (tagged ``'oheencoder'``) is
    computed. Both results are stacked row-wise.

    Args:
        x_train: Training features used for the label-encoder statistics.
        x_test: Test features used for the label-encoder statistics.
        y_train: Training target.
        y_test: Test target.

    Returns:
        The stacked statistics restricted to the columns ``c_val``,
        ``rmse``, ``mae`` and ``r2`` (in that order).
    """
    # Statistics for the data exactly as supplied by the caller.
    label_stats = create_stats(
        x_train, x_test, y_train, y_test, enc='labelencoder'
    )

    # NOTE(review): `df` is neither a parameter nor a local; it is read from
    # the surrounding module scope. The incoming splits are replaced by a
    # fresh split of that frame before one hot encoding.
    x_train, x_test, y_train, y_test = split_dataset(df)
    category_index = [
        position
        for position, column in enumerate(df.columns)
        if df[column].dtype == 'object'
    ]
    x_train, x_test = ohe_encode(x_train, x_test, category_index)

    # Statistics for the one hot encoded data.
    onehot_stats = create_stats(
        x_train, x_test, y_train, y_test, enc='oheencoder'
    )

    stacked = pd.concat([label_stats, onehot_stats], axis=0)
    return stacked[['c_val', 'rmse', 'mae', 'r2']]
def plot_corr(df, size=11):
    """Plot the correlation matrix of the training split and save it.

    The data set is split with ``split_dataset``; training features and
    training target are joined column-wise and their correlation matrix is
    drawn with ``matshow`` using the ``YlOrRd`` colour map. The figure is
    written to ``./images/data_image.png``.

    Args:
        df: Data set passed to ``split_dataset``.
        size: Used for both entries of ``figsize``, giving a square figure.

    Returns:
        The axes object the matrix was drawn on.
    """
    # Only the training part of the split is needed here.
    x_train, _, y_train, _ = split_dataset(df)
    training_frame = pd.concat([x_train, y_train], axis=1)
    correlations = training_frame.corr()

    labels = correlations.columns
    positions = range(len(labels))

    # NOTE(review): `subplots`, `xticks` and `yticks` are used as bare names;
    # presumably they come from a pylab / pyplot import elsewhere - confirm.
    fig, ax = subplots(figsize=(size, size))
    plt.set_cmap('YlOrRd')
    ax.matshow(correlations)
    xticks(positions, labels, rotation=90)
    yticks(positions, labels)

    fig.savefig('./images/data_image.png')
    return ax
# -*- coding: utf-8 -*-
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Module-level preparation: load the CSV, split it, then label encode the
# feature splits. These names are available to importers of this module.
data = load_data('data/student-mat.csv')
x_train, x_test, y_train, y_test = split_dataset(data)
x_train, x_test = label_encode(x_train, x_test)


def visualise_data(data, figname):
    """Display a scatter matrix of *data*.

    Args:
        data: Frame handed to ``scatter_matrix``; every column is plotted
            against every other, with kernel density estimates on the
            diagonal.
        figname: Accepted for interface compatibility but not used: the
            figure is only shown on screen, it is not written to disk.
    """
    # No explicit plt.figure() here: scatter_matrix opens its own figure when
    # no `ax` is passed, so an extra call would only add an empty figure that
    # plt.show() then displays alongside the real one.
    scatter_matrix(data, alpha=0.2, figsize=(15, 15), diagonal='kde')
    plt.show()
# %load q03_ohe_encoder/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import pandas as pd

path = 'data/student-mat.csv'
df = load_data(path)

# Positions and names of the columns whose dtype is 'object'.
category_index = [
    position
    for position, name in enumerate(df.columns)
    if df[name].dtype == 'object'
]
columns = [name for name in df.columns if df[name].dtype == 'object']
print(category_index)

# The one hot encoding itself happens here, on the full frame, before the
# train/test split is taken.
df_new = pd.get_dummies(df, columns=columns)
X_train, X_test, y_train, y_test = split_dataset(df_new)


def ohe_encode(X_train, X_test, defaults=category_index):
    """Pass the train and test features through ``label_encode``.

    Args:
        X_train: Training features.
        X_test: Test features.
        defaults: Column positions of the categorical features. The body
            never reads this argument; it defaults to the module-level
            ``category_index`` list.

    Returns:
        A ``(train, test)`` tuple with the two values produced by
        ``label_encode``.
    """
    # NOTE(review): no dummy columns are created in this function; it relies
    # on the caller having applied pd.get_dummies beforehand, as the
    # module-level code above does.
    train_encoded, test_encoded = label_encode(X_train, X_test)
    return train_encoded, test_encoded


ohe_encode(X_train, X_test, category_index)
def visualise_data(data, figname):
    """Build a scatter matrix of *data* and return it.

    Args:
        data: Frame handed to ``scatter_matrix``.
        figname: Accepted for interface compatibility; not used.

    Returns:
        The value returned by ``scatter_matrix(data, alpha=0.2)``.
    """
    # The data set is plotted as a whole, so no train/test split is taken
    # here (a previous split's results were never used). The result is also
    # not bound to a local called `plt`, which would shadow the usual
    # matplotlib.pyplot alias.
    return scatter_matrix(data, alpha=0.2)