예제 #1
0
def test_clean_data_6():
    '''
    Verify that clean_data converts every diagnosis value to an int.
    '''
    cleaned = clean_split_data.clean_data(data)
    for value in cleaned.diagnosis:
        assert isinstance(value, int), ("Diagnosis values are not integers")

    return
예제 #2
0
def test_clean_data_4():
    '''
    Verify that the "id" column was dropped by clean_data: no remaining
    column name may contain the substring "id".
    '''
    cleaned = clean_split_data.clean_data(data)
    for column in cleaned.columns:
        assert "id" not in column, ("ID column still exists in dataframe")

    return
예제 #3
0
def test_clean_data_1():
    '''
    Verify that every column label of the cleaned dataframe is a string.
    '''
    cleaned = clean_split_data.clean_data(data)
    for column in cleaned.columns:
        assert isinstance(column, str), (
            "Data columns are not all strings, there may be a filtering error")

    return
예제 #4
0
def test_clean_data_3():
    '''
    Verify that no "_worst" measurement columns survive clean_data.
    '''
    cleaned = clean_split_data.clean_data(data)
    for column in cleaned.columns:
        assert "_worst" not in column, (
            "Worst measurement columns still exist in dataframe")

    return
예제 #5
0
def test_clean_data_2():
    '''
    Verify that no "_se" (standard error) columns survive clean_data.
    '''
    cleaned = clean_split_data.clean_data(data)
    for column in cleaned.columns:
        assert "_se" not in column, (
            "Standard error columns still exist in dataframe")

    return
예제 #6
0
def test_split_data_2():
    '''
    Verify that the training partition holds roughly 80% of the rows
    (within a 0.1 absolute tolerance).
    '''
    n_rows = len(data)
    cleaned = clean_split_data.clean_data(data)
    X_train, _X_test, _y_train, _y_test = clean_split_data.split_data(cleaned)
    fraction = len(X_train) / n_rows
    assert math.isclose(
        fraction, 0.80,
        abs_tol=0.1), ("Training set is not at specified 80% of dataset")

    return
예제 #7
0
def test_split_data_1():
    '''
    Verify that splitting preserves the total row count: the train and
    test partitions together must equal the original length.
    '''
    before = len(data)
    cleaned = clean_split_data.clean_data(data)
    X_train, X_test, _y_train, _y_test = clean_split_data.split_data(cleaned)
    after = len(X_train) + len(X_test)
    assert math.isclose(
        before,
        after), ("Length of data is not the same as before splitting")

    return
예제 #8
0
def stacking_predictor(row):
    """
    Train a stacking ensemble on the project dataset and classify one patient.

    The base layer is a random forest, a logistic regression, a KNN and an
    RBF-kernel SVM; a logistic regression serves as the final (meta)
    estimator. The model is fit on 7 selected mean features from
    data/data.csv (cleaned with clean_data), then applied to `row`.

    Parameters
    ----------
    row : sequence of 7 numbers
        Feature values ordered as: radius_mean, texture_mean, area_mean,
        concavity_mean, concave points_mean, symmetry_mean, smoothness_mean.

    Returns
    -------
    str
        "patient <label>" where <label> is the integer diagnosis predicted
        for this row.
    """
    our_trained_data = pd.read_csv("data/data.csv")
    our_trained_data = clean_data(our_trained_data)

    # The 7 features retained by the project's feature-selection step.
    feature_columns = [
        'radius_mean', 'texture_mean', 'area_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'smoothness_mean'
    ]
    x = our_trained_data[feature_columns]
    # Select the target as a Series so no manual flattening is needed
    # (the original built a DataFrame and unrolled it with nested loops).
    y = our_trained_data['diagnosis']

    # 20% holdout mirrors the split used elsewhere in the project; only the
    # training portion is consumed here.
    x_train, _x_test, y_train, _y_test = train_test_split(x,
                                                          y,
                                                          test_size=0.2,
                                                          random_state=42)

    estimators = [('random_forest',
                   RandomForestClassifier(n_estimators=5, random_state=42)),
                  ('logistic_regr',
                   LogisticRegression(solver="lbfgs", max_iter=1460)),
                  ('knn', KNeighborsClassifier(n_neighbors=5)),
                  ('svm_rbf', SVC(kernel='rbf', gamma=4, C=10000))]

    stacking_classifier = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(), cv=5)

    # Fit on plain arrays (not the DataFrame) so predicting from a bare
    # feature list below does not trigger feature-name warnings.
    stacking_classifier.fit(x_train.values, y_train.values)

    # predict() returns a 1-element array; index it so the %d formatting
    # receives a scalar (formatting a size-1 array with %d is deprecated
    # and removed in modern NumPy).
    single_predicted_result = stacking_classifier.predict([row])[0]

    return ('%s %d' % ("patient", single_predicted_result))
# Pandas library for the pandas dataframes
import pandas as pd

# Import Scikit-Learn library for decision tree models
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Import plotting libraries
import matplotlib

# Set larger fontsize for all plots
matplotlib.rcParams.update({'font.size': 18})

# ### Data
# Load the raw dataset, then clean and split it with the project helpers
# (clean_data / split_data are defined elsewhere in this package).
data = pd.read_csv('data/data.csv')
data = clean_data(data)
X_train, X_test, y_train, y_test = split_data(data)

# ### Classifier
# Shallow decision tree (max_depth=5) fit once at import time on the
# training partition; used by the predictor functions below.
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)


# ### Optimized Decision Tree Predictor
def feature_names():
    '''
    Returns array of input features of best
    performing backwards stepwise selection test.
    '''

    return [