def test_split_data_2():
    '''
    Check that the training portion of the split is about 80% of the data.

    The fraction is measured against the row count of ``data`` taken
    before ``clean_data`` is called. With ``abs_tol=0.1`` the check
    accepts any training fraction within 0.1 of 0.80.
    '''
    # Row count is captured first, before the cleaning step runs.
    n_rows = len(data)
    cleaned = clean_split_data.clean_data(data)
    # Only the training features are needed; the other three pieces are
    # unpacked so the 4-tuple shape of the result is still enforced.
    X_train, _X_test, _y_train, _y_test = clean_split_data.split_data(cleaned)
    train_fraction = len(X_train) / n_rows
    within_tolerance = math.isclose(train_fraction, 0.80, abs_tol=0.1)
    assert within_tolerance, "Training set is not at specified 80% of dataset"
def test_split_data_1():
    '''
    Check that splitting does not change the total number of rows.

    The row count of ``data`` (taken before ``clean_data`` is called)
    must match the combined length of the training and testing feature
    sets returned by ``split_data``.
    '''
    # Row count is captured first, before the cleaning step runs.
    n_rows = len(data)
    cleaned = clean_split_data.clean_data(data)
    # Targets are unpacked but unused; only the feature sets are counted.
    X_train, X_test, _y_train, _y_test = clean_split_data.split_data(cleaned)
    n_after_split = len(X_train) + len(X_test)
    assert math.isclose(n_rows, n_after_split), (
        "Length of data is not the same as before splitting")
import pandas as pd # Import Scikit-Learn library for decision tree models from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split # Import plotting libraries import matplotlib # Set larger fontsize for all plots matplotlib.rcParams.update({'font.size': 18}) # ### Data data = pd.read_csv('data/data.csv') data = clean_data(data) X_train, X_test, y_train, y_test = split_data(data) # ### Classifier clf = DecisionTreeClassifier(max_depth=5) clf.fit(X_train, y_train) # ### Optimized Decision Tree Predictor def feature_names(): ''' Returns array of input features of best performing backwards stepwise selection test. ''' return [ 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',