# own_data.py
# Forked from datarise/ml_dashboard.
# Original listing: 115 lines (81 loc), 4.13 KB.
import streamlit as st
import pandas as pd
from pandas_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from classifications import *
@st.cache()
def load_data(csv):
    """Parse the uploaded CSV into a DataFrame.

    The function is decorated with ``st.cache()``, Streamlit's memoizing
    decorator.

    Args:
        csv: Anything ``pd.read_csv`` accepts; ``app`` passes the object
            returned by ``st.file_uploader``.

    Returns:
        The ``pandas.DataFrame`` read from ``csv``.
    """
    return pd.read_csv(csv)
#@st.cache()
def profilling(df):
    """Build an explorative profiling report for ``df`` and embed it in the page.

    Args:
        df: DataFrame handed to ``ProfileReport``.
    """
    # Build the report and pass it straight to the Streamlit renderer.
    st_profile_report(ProfileReport(df, explorative=True))
def create_model(df):
    """Render the widgets that configure a model and hand off to ``select_model``.

    The user picks feature columns, a preprocessing step per feature, the
    target column, the test-set ratio, a scaler and a model type. The chosen
    columns are run through ``preprocessing`` and the result is passed to
    ``select_model`` together with a fixed seed of 42.

    Args:
        df: DataFrame whose columns are offered as features/target.

    Returns:
        None. All output is rendered through Streamlit.
    """
    X_select = st.multiselect(
        'Choose the features for the model: ',
        df.columns)
    if not X_select:
        # Every widget below depends on the feature selection.
        return

    st.markdown("We now need to select the preprocessing steps that are needed for each column. Feature scaling is done later on. The dashboard will automaticly impute missing values.")

    preprocessing_feature = []
    preprocessing_feature_method = []
    for feature in X_select:
        preprocess = st.multiselect(
            f"Please choose the preprocessing steps that are needed for {feature}",
            ('Nothing', 'Label Encoding', 'One-Hot Encoding'))
        if preprocess:
            # Only the first chosen step is applied; a feature without any
            # chosen step is left out of the model entirely.
            preprocessing_feature.append(feature)
            preprocessing_feature_method.append(preprocess[0])

    y_select = st.selectbox(
        'Choose the target variable for den model: ',
        X_select)

    # The target column is removed from X inside ``preprocessing``. If no
    # other column has a preprocessing step yet, X would have zero columns,
    # so wait for the user instead of handing an empty matrix downstream.
    if all(feature == y_select for feature in preprocessing_feature):
        st.info("Select a preprocessing step for at least one feature other than the target to continue.")
        return

    # ``preprocessing`` expects the target as the last entry of both lists.
    preprocessing_feature.append(y_select)
    preprocessing_feature_method.append('Target')
    X, y = preprocessing(preprocessing_feature, preprocessing_feature_method, df)

    st.markdown('We will start by selecting the ratio of the traning and test set. We will leave out cross-validation for now')
    # NOTE(review): the slider allows 0.0 and 1.0 - confirm that
    # ``select_model`` copes with an empty train or test split.
    test_size = st.slider('Choose the train/test raio: ', 0.0, 1.0, 0.1, 0.05)

    st.markdown('We now need to scale the data. We can chosse between the standard or min-max scaler method:')
    scaler = st.selectbox(
        'Choose the scaler: ',
        ('Standard Scaler', 'Min-Max Scaler'))

    st.markdown('We can now select the model we want to evaluate.')
    # These option strings are passed verbatim to ``select_model``; keep the
    # spelling in sync with whatever it matches on.
    model = st.selectbox(
        'Choose the model: ',
        ('Logistic Regression', 'Desicion Tree', 'Random Forest', 'Support Vector Machine', 'Simple Neural Network'))

    st.markdown('The model parameters now needs to be choosen:')
    select_model(model, X, y, test_size, 42, scaler)
def preprocessing(preprocessing_feature, preprocessing_feature_method, df):
    """Encode the selected columns, impute missing values and split X / y.

    Args:
        preprocessing_feature: Column names of ``df`` to use. The last entry
            is the target column.
        preprocessing_feature_method: One step per entry of
            ``preprocessing_feature``: ``"Label Encoding"``,
            ``"One-Hot Encoding"`` or anything else (e.g. ``"Nothing"``,
            ``"Target"``) to copy the column unchanged.
        df: Source DataFrame.

    Returns:
        Tuple ``(X, y)``: ``X`` is the imputed feature matrix without the
        target column, ``y`` the imputed target column cast to integers.
    """
    df_X = pd.DataFrame()
    for feature, process in zip(preprocessing_feature, preprocessing_feature_method):
        if process == "Label Encoding":
            df_X[feature] = LabelEncoder().fit_transform(df[feature])
        elif process == "One-Hot Encoding":
            # Prefix the dummy columns with the feature name so they are
            # always strings and cannot clash with another selected column
            # (unprefixed, they are named after the raw category values).
            dummies = pd.get_dummies(df[feature], prefix=str(feature), drop_first=True)
            df_X = pd.concat([df_X, dummies], axis=1)
        elif process == "Target" and feature in df_X.columns:
            # The target was also selected as a feature and already encoded
            # above; copying the raw column again would undo that encoding.
            continue
        else:
            df_X[feature] = df[feature].values

    # Fill missing values from the other columns.
    imputer = IterativeImputer(max_iter=10, verbose=0)
    df_X = pd.DataFrame(imputer.fit_transform(df_X), columns=df_X.columns)

    target = preprocessing_feature[-1]
    # ``int`` gives the same dtype the abstract ``np.integer`` resolved to,
    # without the NumPy deprecation and without needing ``np`` in scope.
    y = df_X[target].values.astype(int)
    X = df_X.drop(target, axis=1).values
    return X, y
def app():
    """Render the "Sklearn Playground" page.

    Shows the title and an upload widget. Once a CSV is uploaded, the data is
    displayed, an optional profiling report can be toggled on, and the
    model-building widgets from ``create_model`` are rendered.
    """
    st.title("Sklearn Playground")
    st.markdown("Here can you play around classifing your own data. Start by uploading your data as a csv.")

    csv = st.file_uploader("Upload your data set as csv", type="csv")
    if not csv:
        # Nothing uploaded yet: the rest of the page needs the data.
        return

    df = load_data(csv)
    st.markdown("An overview of the data.")
    st.write(df)

    st.markdown("A exploratory data analysis is displayed below if the buttons is clicked. It's made with Pandas Profiling and can take a while to run. You can therefor turn it of when you are done.")
    if st.radio("Show EDA", ("No", "Yes")) == "Yes":
        profilling(df)

    st.markdown("We now need to select the features and target variable that we want to use.")
    create_model(df)