def test_clean_data_6():
    """Check that clean_data() maps diagnosis labels to integer values.

    Uses numbers.Integral rather than ``int`` so that both built-in ints
    and NumPy integer scalars are accepted: a pandas integer column
    yields ``numpy.int64`` values on iteration, and
    ``isinstance(numpy.int64(1), int)`` is False, which would make this
    test fail even when the conversion is correct.
    """
    import numbers  # stdlib ABC; NumPy integer scalars register as Integral

    df = clean_split_data.clean_data(data)
    for cancer in df.diagnosis:
        assert isinstance(cancer, numbers.Integral), (
            "Diagnosis values are not integers")
def test_clean_data_4():
    """Verify the "id" column was dropped by clean_data()."""
    cleaned = clean_split_data.clean_data(data)
    for column in cleaned.columns:
        assert "id" not in column, ("ID column still exists in dataframe")
def test_clean_data_1():
    """Verify every column label of the cleaned dataframe is a string."""
    cleaned = clean_split_data.clean_data(data)
    assert all(isinstance(label, str) for label in cleaned.columns), (
        "Data columns are not all strings, there may be a filtering error")
def test_clean_data_3():
    """Verify no "_worst" measurement columns survive clean_data()."""
    cleaned = clean_split_data.clean_data(data)
    offenders = [label for label in cleaned.columns if "_worst" in label]
    assert not offenders, (
        "Worst measurement columns still exist in dataframe")
def test_clean_data_2():
    """Verify no "_se" (standard-error) columns survive clean_data()."""
    cleaned = clean_split_data.clean_data(data)
    assert not [label for label in cleaned.columns if "_se" in label], (
        "Standard error columns still exist in dataframe")
def test_split_data_2():
    """Check that split_data() places roughly 80% of rows in training."""
    n_rows = len(data)
    cleaned = clean_split_data.clean_data(data)
    X_train, X_test, y_train, y_test = clean_split_data.split_data(cleaned)
    train_fraction = len(X_train) / n_rows
    # 0.1 absolute tolerance allows for integer rounding of the split.
    assert math.isclose(train_fraction, 0.80, abs_tol=0.1), (
        "Training set is not at specified 80% of dataset")
def test_split_data_1():
    """Check that splitting preserves the total number of rows."""
    n_rows = len(data)
    cleaned = clean_split_data.clean_data(data)
    X_train, X_test, y_train, y_test = clean_split_data.split_data(cleaned)
    recombined = len(X_train) + len(X_test)
    assert math.isclose(n_rows, recombined), (
        "Length of data is not the same as before splitting")
def stacking_predictor(row):
    """Train a stacking ensemble on the project data and classify one patient.

    The base layer combines a random forest, logistic regression, KNN and
    an RBF-kernel SVM; the final estimator is logistic regression, stacked
    with 5-fold cross-validation.

    Parameters
    ----------
    row : sequence of 7 floats
        Feature values ordered exactly as the training columns below
        (radius_mean ... smoothness_mean).

    Returns
    -------
    str
        ``"patient <label>"`` where ``<label>`` is the predicted
        integer diagnosis code.

    NOTE(review): the model is re-read from data/data.csv and re-fitted on
    every call — cache the fitted classifier if this is invoked repeatedly.
    """
    our_trained_data = pd.read_csv("data/data.csv")
    our_trained_data = clean_data(our_trained_data)

    # The 7 features selected by the backwards stepwise-selection study.
    x = our_trained_data[[
        'radius_mean', 'texture_mean', 'area_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'smoothness_mean'
    ]]
    y = our_trained_data[['diagnosis']]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    # Flatten the single-column target frame into a 1-D label list.
    X = x_train.values.tolist()
    flattened_y_train = [
        label for sub_list in y_train.values.tolist() for label in sub_list
    ]

    estimators = [
        ('random_forest', RandomForestClassifier(n_estimators=5,
                                                 random_state=42)),
        ('logistic_regr', LogisticRegression(solver="lbfgs", max_iter=1460)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('svm_rbf', SVC(kernel='rbf', gamma=4, C=10000)),
    ]
    stacking_classifier = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(), cv=5)

    # Fit the stacking model with our own data and the 7 selected features.
    stacking_classifier.fit(X, flattened_y_train)

    # predict() returns a 1-element array; take the scalar out explicitly —
    # formatting a whole 1-element ndarray with %d is deprecated in NumPy
    # and will eventually raise a TypeError.
    prediction = stacking_classifier.predict([row])[0]
    return '%s %d' % ("patient", prediction)
# Pandas library for the pandas dataframes import pandas as pd # Import Scikit-Learn library for decision tree models from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split # Import plotting libraries import matplotlib # Set larger fontsize for all plots matplotlib.rcParams.update({'font.size': 18}) # ### Data data = pd.read_csv('data/data.csv') data = clean_data(data) X_train, X_test, y_train, y_test = split_data(data) # ### Classifier clf = DecisionTreeClassifier(max_depth=5) clf.fit(X_train, y_train) # ### Optimized Decision Tree Predictor def feature_names(): ''' Returns array of input features of best performing backwards stepwise selection test. ''' return [